From 51f4670f60f56023707e793f2cd2a5d422463b64 Mon Sep 17 00:00:00 2001 From: amirmoghi3 Date: Fri, 25 Dec 2020 16:33:49 +0330 Subject: [PATCH] Initial Commit --- .history/app_20201225152542.py | 0 .history/app_20201225152608.py | 21 ++++++++ .history/app_20201225152810.py | 21 ++++++++ .history/app_20201225152934.py | 22 ++++++++ .history/app_20201225153009.py | 22 ++++++++ .history/app_20201225153016.py | 22 ++++++++ .history/app_20201225153034.py | 23 ++++++++ .history/app_20201225153041.py | 24 +++++++++ .history/app_20201225153057.py | 24 +++++++++ .history/app_20201225153107.py | 24 +++++++++ .history/app_20201225153134.py | 24 +++++++++ .history/app_20201225153219.py | 22 ++++++++ .history/app_20201225153710.py | 24 +++++++++ .history/app_20201225153815.py | 24 +++++++++ .history/app_20201225153906.py | 22 ++++++++ .history/app_20201225153913.py | 23 ++++++++ .history/app_20201225153932.py | 24 +++++++++ .history/app_20201225154020.py | 24 +++++++++ .history/app_20201225154028.py | 24 +++++++++ .history/app_20201225154033.py | 24 +++++++++ .history/app_20201225154115.py | 24 +++++++++ .history/app_20201225154141.py | 24 +++++++++ .history/app_20201225154223.py | 23 ++++++++ .history/app_20201225154225.py | 22 ++++++++ .history/app_20201225154640.py | 54 +++++++++++++++++++ .history/app_20201225161344.py | 48 +++++++++++++++++ .history/app_20201225161529.py | 48 +++++++++++++++++ .history/app_20201225161542.py | 48 +++++++++++++++++ .history/gini_20201225161914.py | 48 +++++++++++++++++ .history/gini_20201225162123.py | 90 +++++++++++++++++++++++++++++++ .history/gini_20201225162607.py | 90 +++++++++++++++++++++++++++++++ .history/gini_20201225162804.py | 90 +++++++++++++++++++++++++++++++ .history/gini_20201225162859.py | 90 +++++++++++++++++++++++++++++++ .history/gini_20201225163004.py | 92 ++++++++++++++++++++++++++++++++ .history/gini_20201225163146.py | 92 ++++++++++++++++++++++++++++++++ .history/gini_20201225163343.py | 92 
++++++++++++++++++++++++++++++++ .vscode/launch.json | 15 ++++++ .vscode/settings.json | 3 ++ Train.xlsx | Bin 0 -> 23668 bytes gini.py | 92 ++++++++++++++++++++++++++++++++ 40 files changed, 1523 insertions(+) create mode 100644 .history/app_20201225152542.py create mode 100644 .history/app_20201225152608.py create mode 100644 .history/app_20201225152810.py create mode 100644 .history/app_20201225152934.py create mode 100644 .history/app_20201225153009.py create mode 100644 .history/app_20201225153016.py create mode 100644 .history/app_20201225153034.py create mode 100644 .history/app_20201225153041.py create mode 100644 .history/app_20201225153057.py create mode 100644 .history/app_20201225153107.py create mode 100644 .history/app_20201225153134.py create mode 100644 .history/app_20201225153219.py create mode 100644 .history/app_20201225153710.py create mode 100644 .history/app_20201225153815.py create mode 100644 .history/app_20201225153906.py create mode 100644 .history/app_20201225153913.py create mode 100644 .history/app_20201225153932.py create mode 100644 .history/app_20201225154020.py create mode 100644 .history/app_20201225154028.py create mode 100644 .history/app_20201225154033.py create mode 100644 .history/app_20201225154115.py create mode 100644 .history/app_20201225154141.py create mode 100644 .history/app_20201225154223.py create mode 100644 .history/app_20201225154225.py create mode 100644 .history/app_20201225154640.py create mode 100644 .history/app_20201225161344.py create mode 100644 .history/app_20201225161529.py create mode 100644 .history/app_20201225161542.py create mode 100644 .history/gini_20201225161914.py create mode 100644 .history/gini_20201225162123.py create mode 100644 .history/gini_20201225162607.py create mode 100644 .history/gini_20201225162804.py create mode 100644 .history/gini_20201225162859.py create mode 100644 .history/gini_20201225163004.py create mode 100644 .history/gini_20201225163146.py create mode 100644 
.history/gini_20201225163343.py create mode 100644 .vscode/launch.json create mode 100644 .vscode/settings.json create mode 100644 Train.xlsx create mode 100644 gini.py diff --git a/.history/app_20201225152542.py b/.history/app_20201225152542.py new file mode 100644 index 0000000..e69de29 diff --git a/.history/app_20201225152608.py b/.history/app_20201225152608.py new file mode 100644 index 0000000..49e2a2b --- /dev/null +++ b/.history/app_20201225152608.py @@ -0,0 +1,21 @@ +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225152810.py b/.history/app_20201225152810.py new file mode 100644 index 0000000..3c4ace5 --- /dev/null +++ b/.history/app_20201225152810.py @@ -0,0 +1,21 @@ +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +print(gini_index([[[1, 1], [1, 
0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225152934.py b/.history/app_20201225152934.py new file mode 100644 index 0000000..852e422 --- /dev/null +++ b/.history/app_20201225152934.py @@ -0,0 +1,22 @@ +def gini_index(groups, classes): + # count all samples at split point + print(classes) + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153009.py b/.history/app_20201225153009.py new file mode 100644 index 0000000..5a31d44 --- /dev/null +++ b/.history/app_20201225153009.py @@ -0,0 +1,22 @@ +def gini_index(groups, classes): + print(classes) + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file 
diff --git a/.history/app_20201225153016.py b/.history/app_20201225153016.py new file mode 100644 index 0000000..9384f10 --- /dev/null +++ b/.history/app_20201225153016.py @@ -0,0 +1,22 @@ +def gini_index(groups, classes): + + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153034.py b/.history/app_20201225153034.py new file mode 100644 index 0000000..ee7278c --- /dev/null +++ b/.history/app_20201225153034.py @@ -0,0 +1,23 @@ +def gini_index(groups, classes): + print(groups) + print(classes) + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153041.py b/.history/app_20201225153041.py new file mode 100644 index 0000000..7ac7a9f --- 
/dev/null +++ b/.history/app_20201225153041.py @@ -0,0 +1,24 @@ +def gini_index(groups, classes): + + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + print(groups) + print(classes) + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153057.py b/.history/app_20201225153057.py new file mode 100644 index 0000000..42f868f --- /dev/null +++ b/.history/app_20201225153057.py @@ -0,0 +1,24 @@ +def gini_index(groups, classes): + + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + print(groups) + print(classes) + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153107.py b/.history/app_20201225153107.py new file mode 100644 index 0000000..7ac7a9f --- /dev/null +++ b/.history/app_20201225153107.py @@ -0,0 +1,24 @@ +def gini_index(groups, 
classes): + + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + print(groups) + print(classes) + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153134.py b/.history/app_20201225153134.py new file mode 100644 index 0000000..5056bad --- /dev/null +++ b/.history/app_20201225153134.py @@ -0,0 +1,24 @@ +def gini_index(groups, classes): + + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + print(groups) + print(classes) + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153219.py b/.history/app_20201225153219.py new file mode 100644 index 0000000..29bb710 --- /dev/null +++ b/.history/app_20201225153219.py @@ -0,0 +1,22 @@ +def gini_index(groups, classes): + + # count all samples at split point + n_instances = float(sum([len(group) for 
group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153710.py b/.history/app_20201225153710.py new file mode 100644 index 0000000..752a1a7 --- /dev/null +++ b/.history/app_20201225153710.py @@ -0,0 +1,24 @@ +def gini_index(groups, classes): + + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + for row in group: + print(row[-1]) + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153815.py b/.history/app_20201225153815.py new file mode 100644 index 0000000..752a1a7 --- /dev/null +++ b/.history/app_20201225153815.py @@ -0,0 +1,24 @@ +def gini_index(groups, classes): + + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = 
float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + for row in group: + print(row[-1]) + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153906.py b/.history/app_20201225153906.py new file mode 100644 index 0000000..c5550ad --- /dev/null +++ b/.history/app_20201225153906.py @@ -0,0 +1,22 @@ +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# test Gini values +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153913.py b/.history/app_20201225153913.py new file mode 100644 index 0000000..386c39f --- /dev/null +++ b/.history/app_20201225153913.py @@ -0,0 +1,23 @@ +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group 
based on the score for each class + for class_val in classes: + + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# test Gini values +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153932.py b/.history/app_20201225153932.py new file mode 100644 index 0000000..05c1067 --- /dev/null +++ b/.history/app_20201225153932.py @@ -0,0 +1,24 @@ +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + for row in group: + print(row[-1]) + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# test Gini values +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225154020.py b/.history/app_20201225154020.py new file mode 100644 index 0000000..05c1067 --- /dev/null +++ b/.history/app_20201225154020.py @@ -0,0 +1,24 @@ +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + for row in group: + 
print(row[-1]) + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# test Gini values +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225154028.py b/.history/app_20201225154028.py new file mode 100644 index 0000000..7573e2c --- /dev/null +++ b/.history/app_20201225154028.py @@ -0,0 +1,24 @@ +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + for row in group: + print(row[-1]) + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# test Gini values +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225154033.py b/.history/app_20201225154033.py new file mode 100644 index 0000000..05c1067 --- /dev/null +++ b/.history/app_20201225154033.py @@ -0,0 +1,24 @@ +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + for row in group: + print(row[-1]) + p = [row[-1] for row in 
group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# test Gini values +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225154115.py b/.history/app_20201225154115.py new file mode 100644 index 0000000..0f5fa34 --- /dev/null +++ b/.history/app_20201225154115.py @@ -0,0 +1,24 @@ +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + for row in group: + print(row[-1]) + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# test Gini values +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225154141.py b/.history/app_20201225154141.py new file mode 100644 index 0000000..f77015a --- /dev/null +++ b/.history/app_20201225154141.py @@ -0,0 +1,24 @@ +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + for row in group: + print(row[-1]) + p = [row[-1] for row in group].count(class_val) / size + score += p * p + 
# weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# test Gini values +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225154223.py b/.history/app_20201225154223.py new file mode 100644 index 0000000..99f846b --- /dev/null +++ b/.history/app_20201225154223.py @@ -0,0 +1,23 @@ +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# test Gini values +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225154225.py b/.history/app_20201225154225.py new file mode 100644 index 0000000..c5550ad --- /dev/null +++ b/.history/app_20201225154225.py @@ -0,0 +1,22 @@ +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# test Gini 
values +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225154640.py b/.history/app_20201225154640.py new file mode 100644 index 0000000..81873ae --- /dev/null +++ b/.history/app_20201225154640.py @@ -0,0 +1,54 @@ +def test_split(index, value, dataset): + left, right = list(), list() + for row in dataset: + if row[index] < value: + left.append(row) + else: + right.append(row) + return left, right + +# Calculate the Gini index for a split dataset +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# Select the best split point for a dataset +def get_split(dataset): + class_values = list(set(row[-1] for row in dataset)) + b_index, b_value, b_score, b_groups = 999, 999, 999, None + for index in range(len(dataset[0])-1): + for row in dataset: + groups = test_split(index, row[index], dataset) + gini = gini_index(groups, class_values) + print('X%d < %.3f Gini=%.3f' % ((index+1), row[index], gini)) + if gini < b_score: + b_index, b_value, b_score, b_groups = index, row[index], gini, groups + return {'index':b_index, 'value':b_value, 'groups':b_groups} + +dataset = [[2.771244718,1.784783929,0], + [1.728571309,1.169761413,0], + [3.678319846,2.81281357,0], + [3.961043357,2.61995032,0], + [2.999208922,2.209014212,0], + [7.497545867,3.162953546,1], + [9.00220326,3.339047188,1], + [7.444542326,0.476683375,1], + 
[10.12493903,3.234550982,1], + [6.642287351,3.319983761,1]] +split = get_split(dataset) +print('Split: [X%d < %.3f]' % ((split['index']+1), split['value'])) \ No newline at end of file diff --git a/.history/app_20201225161344.py b/.history/app_20201225161344.py new file mode 100644 index 0000000..076cce8 --- /dev/null +++ b/.history/app_20201225161344.py @@ -0,0 +1,48 @@ +import pandas as pd + + +def test_split(index, value, dataset): + left, right = list(), list() + for row in dataset: + if row[index] < value: + left.append(row) + else: + right.append(row) + return left, right + +# Calculate the Gini index for a split dataset +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# Select the best split point for a dataset +def get_split(dataset): + class_values = list(set(row[-1] for row in dataset)) + b_index, b_value, b_score, b_groups = 999, 999, 999, None + for index in range(len(dataset[0])-1): + for row in dataset: + groups = test_split(index, row[index], dataset) + gini = gini_index(groups, class_values) + print('X%d < %.3f Gini=%.3f' % ((index+1), row[index], gini)) + if gini < b_score: + b_index, b_value, b_score, b_groups = index, row[index], gini, groups + return {'index':b_index, 'value':b_value, 'groups':b_groups} + +dataset = pd.read_excel('dataset2.xls', sheet_name="forestfires").to_numpy() +split = get_split(dataset) +print('Split: [X%d < %.3f]' % ((split['index']+1), split['value'])) \ No newline at end of file diff --git 
a/.history/app_20201225161529.py b/.history/app_20201225161529.py new file mode 100644 index 0000000..d5303c5 --- /dev/null +++ b/.history/app_20201225161529.py @@ -0,0 +1,48 @@ +import pandas as pd + + +def test_split(index, value, dataset): + left, right = list(), list() + for row in dataset: + if row[index] < value: + left.append(row) + else: + right.append(row) + return left, right + +# Calculate the Gini index for a split dataset +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# Select the best split point for a dataset +def get_split(dataset): + class_values = list(set(row[-1] for row in dataset)) + b_index, b_value, b_score, b_groups = 999, 999, 999, None + for index in range(len(dataset[0])-1): + for row in dataset: + groups = test_split(index, row[index], dataset) + gini = gini_index(groups, class_values) + print('X%d < %.3f Gini=%.3f' % ((index+1), row[index], gini)) + if gini < b_score: + b_index, b_value, b_score, b_groups = index, row[index], gini, groups + return {'index':b_index, 'value':b_value, 'groups':b_groups} + +dataset = pd.read_excel('train.xls', sheet_name="Sheet1").to_numpy() +split = get_split(dataset) +print('Split: [X%d < %.3f]' % ((split['index']+1), split['value'])) \ No newline at end of file diff --git a/.history/app_20201225161542.py b/.history/app_20201225161542.py new file mode 100644 index 0000000..8b04984 --- /dev/null +++ b/.history/app_20201225161542.py @@ -0,0 +1,48 @@ +import pandas as pd + + +def 
test_split(index, value, dataset): + left, right = list(), list() + for row in dataset: + if row[index] < value: + left.append(row) + else: + right.append(row) + return left, right + +# Calculate the Gini index for a split dataset +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# Select the best split point for a dataset +def get_split(dataset): + class_values = list(set(row[-1] for row in dataset)) + b_index, b_value, b_score, b_groups = 999, 999, 999, None + for index in range(len(dataset[0])-1): + for row in dataset: + groups = test_split(index, row[index], dataset) + gini = gini_index(groups, class_values) + print('X%d < %.3f Gini=%.3f' % ((index+1), row[index], gini)) + if gini < b_score: + b_index, b_value, b_score, b_groups = index, row[index], gini, groups + return {'index':b_index, 'value':b_value, 'groups':b_groups} + +dataset = pd.read_excel('Train.xlsx', sheet_name="Sheet1").to_numpy() +split = get_split(dataset) +print('Split: [X%d < %.3f]' % ((split['index']+1), split['value'])) \ No newline at end of file diff --git a/.history/gini_20201225161914.py b/.history/gini_20201225161914.py new file mode 100644 index 0000000..8b04984 --- /dev/null +++ b/.history/gini_20201225161914.py @@ -0,0 +1,48 @@ +import pandas as pd + + +def test_split(index, value, dataset): + left, right = list(), list() + for row in dataset: + if row[index] < value: + left.append(row) + else: + right.append(row) + return left, right + +# Calculate the Gini index for a 
def gini_index(groups, classes):
    """Return the size-weighted Gini impurity of a candidate split.

    Args:
        groups: Sequence of row groups (for example the ``(left, right)``
            pair produced by a split). It is traversed twice, so it must
            be re-iterable. The class label is read from the last element
            of every row.
        classes: Class labels to score each group against.

    Returns:
        The sum over all non-empty groups of
        ``(1 - sum(p_c * p_c for c in classes)) * (len(group) / total_rows)``
        where ``p_c`` is the fraction of rows in the group labelled ``c``.
        ``0.0`` means every non-empty group is pure; if all groups are
        empty the result is also ``0.0``.
    """
    # Total number of rows across all groups; used to weight each group.
    n_instances = float(sum(len(group) for group in groups))
    gini = 0.0
    for group in groups:
        size = float(len(group))
        # An empty group contributes nothing and would divide by zero below.
        if size == 0:
            continue
        # Extract the labels once per group rather than once per class:
        # the list does not depend on the class being scored.
        labels = [row[-1] for row in group]
        score = 0.0
        for class_val in classes:
            p = labels.count(class_val) / size
            score += p * p
        # Weight the group's impurity by its share of all rows.
        gini += (1.0 - score) * (size / n_instances)
    return gini
def to_terminal(group):
    """Return the most common class label in ``group`` (a leaf prediction).

    The label is read from the last element of every row. When several
    labels share the highest count, the one that appears first in
    ``group`` is returned, so the result is reproducible from run to run.

    Args:
        group: Iterable of indexable rows; labels must be hashable.

    Returns:
        The majority label.

    Raises:
        ValueError: If ``group`` contains no rows.
    """
    # Local import: the surrounding file only imports pandas at the top.
    from collections import Counter

    # Single pass over the rows instead of one ``list.count`` per label.
    counts = Counter(row[-1] for row in group)
    if not counts:
        raise ValueError("to_terminal() requires a non-empty group")
    # most_common() keeps first-seen order among equal counts.
    return counts.most_common(1)[0][0]
def print_tree(node, depth=0):
    """Print a decision tree to stdout, one node per line, in pre-order.

    Args:
        node: Either a dict with ``'index'``, ``'value'``, ``'left'`` and
            ``'right'`` keys (an internal node) or any other object, which
            is printed as a leaf.
        depth: Nesting level of ``node``; each level adds one space of
            indentation.

    Internal nodes are shown as ``[X<feature number> < <threshold>]`` with
    a 1-based feature number; leaves are shown as ``[<value>]``.
    """
    indent = depth * ' '
    # Guard clause: anything that is not a dict is a leaf.
    if not isinstance(node, dict):
        print('%s[%s]' % (indent, node))
        return
    feature_number = node['index'] + 1
    print('%s[X%d < %.3f]' % (indent, feature_number, node['value']))
    # Left subtree is always printed before the right one.
    for side in ('left', 'right'):
        print_tree(node[side], depth + 1)
def split(node, max_depth, min_size, depth):
    """Grow the subtree below ``node`` in place.

    ``node`` must carry a ``'groups'`` entry holding a ``(left, right)``
    pair of row lists. That entry is removed and replaced by ``'left'`` and
    ``'right'`` entries, each either a leaf value (from ``to_terminal``) or
    a further node dict (from ``get_split``) that is expanded recursively.

    Args:
        node: Node dict to expand; mutated in place.
        max_depth: Depth at which both children are forced to be leaves.
        min_size: A child with at most this many rows becomes a leaf.
        depth: Depth of ``node`` itself.

    Returns:
        None.
    """
    groups = node['groups']
    left, right = groups
    del node['groups']

    # One side is empty, so nothing was separated: both children share
    # the leaf computed from all rows.
    if not (left and right):
        leaf = to_terminal(left + right)
        node['left'] = leaf
        node['right'] = leaf
        return

    # Depth limit reached: stop growing and emit two leaves.
    if depth >= max_depth:
        left_leaf = to_terminal(left)
        right_leaf = to_terminal(right)
        node['left'] = left_leaf
        node['right'] = right_leaf
        return

    # Left child is fully expanded before the right child is looked at.
    for side, rows in (('left', left), ('right', right)):
        if len(rows) <= min_size:
            node[side] = to_terminal(rows)
            continue
        node[side] = get_split(rows)
        split(node[side], max_depth, min_size, depth + 1)
def build_tree(train, max_depth, min_size):
    """Build a decision tree from ``train`` and return its root node.

    Args:
        train: Training rows, passed unchanged to ``get_split``.
        max_depth: Forwarded to ``split``.
        min_size: Forwarded to ``split``.

    Returns:
        The root node dict produced by ``get_split`` after ``split`` has
        expanded it in place.
    """
    tree = get_split(train)
    # The root counts as depth 1; split() fills in the children in place.
    split(tree, max_depth, min_size, depth=1)
    return tree
if __name__ == "__main__":
    # Script entry point: load sheet "Sheet1" of Train.xlsx (path is
    # relative to the current working directory), convert it to a NumPy
    # array, fit a tree with max_depth=1 / min_size=1 and print it.
    dataset = (
        pd.read_excel('Train.xlsx', sheet_name="Sheet1")
        .to_numpy()
    )
    tree = build_tree(dataset, max_depth=1, min_size=1)
    print_tree(tree)
def get_split(dataset):
    """Find the best single (feature, threshold) split of ``dataset``.

    Every value that occurs in a feature column is tried as a threshold;
    the candidate with the lowest weighted Gini index (as computed by
    ``gini_index`` on the groups from ``test_split``) wins. On equal
    scores the first candidate encountered is kept.

    Args:
        dataset: Non-empty sequence of indexable rows whose last element
            is the class label; all other positions are features. Feature
            values must be hashable.

    Returns:
        A dict with keys ``'index'`` (feature position), ``'value'``
        (threshold) and ``'groups'`` (the ``(left, right)`` row lists).
        If the rows contain no feature columns, ``'index'`` and
        ``'value'`` keep their placeholder ``999`` and ``'groups'`` is
        ``None``.

    Raises:
        IndexError: If ``dataset`` is empty (``dataset[0]`` is read to
            determine the number of columns).
    """
    class_values = list(set(row[-1] for row in dataset))
    # 999 placeholders for index/value are kept for backward compatibility;
    # the score starts at +inf so any real Gini value replaces it.
    b_index, b_value, b_score, b_groups = 999, 999, float('inf'), None
    n_features = len(dataset[0]) - 1
    for index in range(n_features):
        # A repeated threshold yields exactly the same groups and score,
        # and only a strictly lower score replaces the current best, so
        # re-evaluating it can never change the outcome.
        tried = set()
        for row in dataset:
            value = row[index]
            if value in tried:
                continue
            tried.add(value)
            groups = test_split(index, value, dataset)
            gini = gini_index(groups, class_values)
            if gini < b_score:
                b_index, b_value, b_score, b_groups = index, value, gini, groups
    return {'index': b_index, 'value': b_value, 'groups': b_groups}
a decision tree +def build_tree(train, max_depth, min_size): + root = get_split(train) + split(root, max_depth, min_size, 1) + return root + +# Print a decision tree +def print_tree(node, depth=0): + if isinstance(node, dict): + print('%s[X%d < %.3f]' % ((depth*' ', (node['index']+1), node['value']))) + print_tree(node['left'], depth+1) + print_tree(node['right'], depth+1) + else: + print('%s[%s]' % ((depth*' ', node))) + + +if __name__ == "__main__": + dataset = pd.read_excel('Train.xlsx', sheet_name="Sheet1").to_numpy() + tree = build_tree(dataset, 1, 1) + print_tree(tree) \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..17e15f2 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,15 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python: Current File", + "type": "python", + "request": "launch", + "program": "${file}", + "console": "integratedTerminal" + } + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..c8a63ca --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "python.pythonPath": "C:\\Users\\user\\AppData\\Local\\Programs\\Python\\Python38-32\\python.exe" +} \ No newline at end of file diff --git a/Train.xlsx b/Train.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..87b7d9821f201b1b18cbd67ef4c316f5d4c49b79 GIT binary patch literal 23668 zcmeEt`6HBV`@SWVB^9zpSwnVX$zBN!DU5yJ3S}3AtRYL;%~(RRk3D5y%D$y&knC%S zEZMj3b&sCsect!;e*S{bw;#qd?)$pW>%5NRIFIw3x(6pbMU6*{cNz~5?+RX~W=ro~ zd_26}xft<#JbrXJ;}pTwbUXrZ{r~^-e>ejLv0ZnY z1kWjMD<9mP;R_orlRphx4rsejeU-&@O2r&2fE?*NlA+ozm48iRD_&~eJK`cElG-gc zKyCLtw+=Iej?~33+PYKo-aIY5Vm+D$ovGR9K|=GLpzPc4E$gx&;>LioOWFl+Zs{}r 
zZY0{Q+xSecL}m38e~7(Qr+}o$_CQ1@ZU0QwlNKi=mhx!rm&kfmJ(fhnt=Y+G}Z90O7 z=+>5}*Sq#P&*L3YDe}%?}w}F!u4g*M?QKx z5s|kJiK81(2JONR{Lt@-SBI^7GjiI$79)S-+c>iwr1{YG$kvdvpAxc>I6HqYHu6@| zUU)ItVU0#Q@epgn^a3S7(D5-L-o1aL;X8yN+i&n2RlqVbKtmH}t4A&Z{Lt(Fr{Di! z&i!NS1u<&sO@idXzmyL`+PATzQRi;J-4&E;u4wuAsf6{irAfb|P zozFq%_fhE=i|tporbG+EFHlRdy>~ARM(sJdk??RjC){=_m~1}(1iOjdN>sXoy!xar zk_S_n`x@Rc&UU+h?N;fzUVa^hGgMhj!53u`{qFG?xp5ODe?E7h`*q zeFqb=7bxVOORE0tOQdOcF@Nx_$h-Z~mCaoSgyus@n-cR>=j*IU6U#@98@G}hs1CjP z?xwVBu!>&T5$;uL<4E81)h`!a>w1N37C+O3XC2p=E` z1Kw$Tk4FM0JaKn$wljBdu!E}AU!1`QB@3+bfBR2CbeAnovV)ES(NATbkY>fTe~Y{E zzU2__N+e5j!ZYib0bi5JfKhp6v2Uk~h(}%6AG^y-+XbLBdij&@n(9FNoPRb z%B%{n?AV}-36V53ZXFFuqnG`Jy! zL#eTAr$=TjCN)TVkIBh6-0p0P%%%FL(vC{(n6|PO3H6Kg()%;(wEqns-_mX(l>j!< zftP_~7U1Kb$SBZKbNDVuUPn3Pd(L(EY>}KT*NgKt4`a#6?vX2^-lGhu^kG&3t5ZX&yd+FVrpG3`izN@@iQPUE?)N!1rE}{KuXB5!QR`?OqtbcZ`kY*# zZ&OvgRk$WKqPAm*>`RISYgIt&&0j0buvgA70J|{ef#}H+M-TJ>Sij5U)!?6^@WDPZPV-v zW3v7$+4!kHu4x$2dfRicKP$CcE~8;!i*=aLseRJQk4<-9F+6OKIh~t8rjccz^!4sAhLE##k*rJ&^=R1lmr&HR{C=@Qft{45 zHTQZK7f+Wc0vRETDsB3gMK>bQJZ>#m{w5QaFw@{@H!XzL!xmw0%ZuYPzs9o$E)ms? 
zi_D#qznVN*=h72!&R``mbL)<{>W4IaMwLss)=G~o=n4xO*oi`$9PhBtl5U?S(>o%2 zo;g?gI6$K-_t6HDO>jwO&b?^fpgT*3syz2VyOawrSUPA?!deTTlwl^s zf^d>Xv@B%d;ApB&X{Q?cf zhIgcC_d&40I+ymEcI$&;fdiGAv#d?DyPw>7lJ@9+POMyupiRkzNjJ%fcTsL^Wa4Ue%Hfb%Mj{c0`JKq<5GRZ z-^hD)KV?>Lid3N{ZgHa{isgPq^mf7R0<{WtaZcMIu|d-@ zXNvCDU3Yu&K6CX9u@%nRC)r=-(`yobv-|t^LY??{tSGI(=!-m; zXPS3)`<`HEd5QvK&ojna4nJ`?OOlnnil^^<2scsNT~ZO0cDCKFo+S)lH@pRY6#V4Hqgkr-dBgHMFkXg8 z{>}Mr(dmruziSHZdwWY6uGW+ft-w4t^Zf^Z4h~AVd+&;@lE!`89#{=jIBhHn-cLRa zS7&(l8TxoK5a#`+Zvja`SS){@qhpexdWvMzQSv*D<;Nc{X~x#;6y78-3fzmL!2FE4 zsjAB=Hme`8QsRn-cP3Lgs`-34eQNr7OQ+{oC6z_$Z)aC0 z?dWKfjjT2{gYc3pyU^Sxg!j)8Q=x7(x@_Cz zR=QAYzmmQWM-pq%t*D#HJx0ywmp@CJ z1aGW$cP6q&zdZC~xe>-Xf1OHfhgoF&9Fo#WzN6J~oOS5*_O~03RIZ2|E%xW$!s>jb zwGoNCviv_znPmxI-VaV{W0%w%Ru8Q{zWeoUzN`LMDelsGeW{#GZ>kl!s}nM!>O~eF z4+hAFqz-*@##fGw8^r6n=8K1e?ga+ZV%*+6i4^A_<$Bilv^3oVg~3XNtcxy2mus1i|=)yH`!UO2mRm`zT-Uap)TNciUsZ z$n7DC@1@!pXZNH~5~YlX#XFcMLyw=6UlI*78tSYa9=waSN2|hriQ#UaoKhE*|?VA^1GQw6xUbo7jyAH_Yig z1@&t4T`_7=h92?rcHFEv<^G)28lRgb?Q0Z9L;6e4C%qdz^YdU*I8|ZAv1&!pm9*^v zTeP*n!P3>PYqJ?49&wU}p?9;1h0aU3T@C$Slybzw_%K8CmJ?<`{@j?pGhbAE#S5lW zqvXP=s;kOB%m(!{bKV-<+fn5$P)BSJl60j$Iy*TQ+1mCA|5c9oxlHtU?j4esXF>?m zZRdJ~{Hc24qiA;B^95(iFf82pOr68$UnyQKzFLib;K-4|dwtPAk~ z&-B<0D^sHd@3qO$`>J{qVTmmdUr6R%XTW4UXDAR}(Ed(r{LsiQ&n+r_b;KBSqvZuX$jbCKbKdaz5 zhEfnaJVz;b7XM2X?N{PLPYh$6{^qCjna%1R4PNr)Xi(E1rjdPei z6&mzN^UZYE1Hv-LAH_2m7Y?j3ys{o&8tFv|32vy$I-B@;(G)K?{MQ;bb|KN|Kn@R& znUw&K?xco6KHue`m6fZD0PY8_kj1_F=4{v&uQ)E1aqR!WJ-jkVPod*Wu2gSDj(b=D zd$8nH<>9j9dU21pOh$=zY$*;~*SXX=E729C$wWUk(~gfO55pT|e2&&v`rBBHFOD14 zdav!QE$5_-KRwtCK^Fsn*YB)v_Ozvq*B<{~TLYhZ9IUVR${p=D)E!Ot&o_AQ zF0ZXEk*V$M%!Z&Eo*wS)ZxA~-smWo{c(eA7IfE>9k+B6p5 z-N{*B>Tw4YEv~_4T_84b0qZ*SFkudz?$sHYvR ze;qpW)L@TC&%l?e#K&iCV_LxBXzysr*Jtm|Q}6912NT$gzRBnQ!Og?omLpw+`0#lew@h+bx9#d(%r5>OIHzf4~pENBZ&@H|%|n zEH$+xz8jn+C zl>R9y7F^zZhGuI(Ir&xVcCFQj&5lQ-ya+wA20piE?@N)>9fj-9;5N9K&RI2R`zN<>Fs>N#5+BA zA(=N5))!1tf!m`0+H;abDCf+z<{KCLMXZ7@B&0aJ5m+4!$(DZV^~#&&MJ4%Y2jjUW 
zyC%l5cTnU#s~=Xstahw^Tjf~Y84wyH5R2?&6(O7awZI}u%;)UYu}m|4eL~mM!o$wv zfrrD+IgbkwrpCf+lMhQcYe> zBx6s=C62QFY96$bQR>kWod8f8WLG(SA{Zo7}5XM)4i7K${FgGa-(_9paT7a_t@?OzPj_T6U%9wcHpNI@J7e4T`bW=)Aln7 zx+70Z_-Hnbg6nU5^85Gek2JXYWx;XTXyaBlzT+|xcM|vF=SG~7GTKAs7KB&S7N_QwyugCvwlAmVika5yh z=j!lkuzJ7n3f;E#PUG?$n>6w?3KNfmP7ksnm2SA**^~i6Sz&O9EIFKoLjQ_kV(Uve zE@EjPt6HL;OmkT zmIu2BZ$r8BP6=J>_7G~)X{xa7-c{=+v*VX6F!Q!FBu*K-MKypv3T}gBqmEwIasS~k zSJQ(gTIL2~CSR}fB{iXmazV{DJseYP)4~&9U#r-u{sC(0n&}o30BZN&Kn+5g^>EB{ zQ~GmJYT64_{X6@clw?0Odsx?^CwVIz&dB;E%mu+vOn;L1>vf0iU+exM0F>q(2?(X` zR2yYU@Qu=>#@OVCvf52RhmxEuKJ_t-(T_38(#ftnakh#~w~(7qjK>Jg>j()HRLzud zPVBG<61@Cn*rCbO3Y%^vccA!1{tJ0vI;>=$wxe!3w*qdcdJ12y`dFJg7xRXpwarg6 zNFr;B`|pb>a75fegSA!laF;gM7R>c|mvI6`vMpn$I*Pw+v) z^sjRPtSYUVYChrX-Yr<@i-rFwG(z4K(#QlYZ`O?xpm<1~-Mxn46Fhr1fT z>9p5EO(dzpNw_TrwH14}gHu1hu@3fVce2N`yQtc)|3wc@;s%kFRZX>XPW-N;c#q1i z6Rs7jo$N0Mbs-k>&`12mH`fzE5b~b2u*a8rfqdOVs!J0Z37;MhfKVYMQ);y`O=rd1f0)sbK*y_bJysrZ{_0N6GCj$9jb{{ zPlc9weN_w^R~Y7q#0KZs#QeF6L%@%JmL*Ur=^`VzAW||}ofw0r+Qzvgk@%2Cs*sVt zHhQ8@(ms$4qU(*Ib&136k{76Ik>s$)Y3cjz%N!Lt5q7OFB;kd*6q9rh2_jK~r*jmCjvq%Z+q zWRTBvUT{NhwmGe^YO)vT6npl`DMlE)eE>N;)1CJ3cTceGyq_?!q0?NOIoHY#^XD2j zMrQ_p=LNzV>}2}{zKB5ryum!JMKeex0Y!P++drtorNoqR3ZN==Ku1r zL6L7E5oT&lisKX_exs>0zd`KOQyC5Zu|9Ruf|a!Y{ZB-MHa56@sFVY;T0Gh zoypRAZ0t9j8Q#6QxuJP)#8|k9ZVueN2G=?|*(P$>mPT2mWJpAW@`Mv}o@ z8-lwg6s2{d%kh)2U&Dg-kiTgqoc@yHDC|8aL^chXAusOku7~zLZ8ijv4N{hd(8{1H z#r+|Agp!EW)E4j-*#*H4{!$@{$A^!p`bMUD6)!=xqc~V1YmWOxBCeQj?D(&>d*B4; z{y<~w;Kv8YE+ZAlD=&uE#wiKai>@56Qn{b-@lMzBrRmhQMh{DAp;Yiq&IHH_ehn`g z)jQ%W+l0euoxeowzb*YRTXjShh#5q4zZV~CYCUc*dk|5`F&GmwA2Z5M53AbzsnXv+ zlt+%pLJM2(*zP1mNP59NQM8|PU)5tyOCaH$Cnt$iZK^zRaC27ypt7INZMBpa`?m=M zHb9{cJR=H-h=i0Mp{zM|5^`Vb3vl+2R5jX z#%W(^(PgWNP{nT7|7CT^omDm0rp$es5e;S`jC>TIT{pD_yiPXG$^C~?`i+L08{c(# z;LLCmws>?P!+($ZkLwG!T~XDt*!i+5yxPoqTMFhbexiOjqPHcUC?!s=?YiNQR=O$W zh?u$TBj6%gP!LU(uqG+{U-Xm!QQm(LHBwLXh&g8)>HlI_s6+^t4)62-s*dw(fi_?p zava^7)=+_5U*$`5enRnUV+yF5A(i8LoovNl)f#8?dp!ibUY;2J+GU99qs^dhq!K0r 
zGtM-G{-+s_AZdLg+4;}b#Ms=WH!b(X)gF)qPrjSPH3ZgodY1uXR1?Y52qT&6$kcH? zRS*){BjcM690P;-Nzc~AD63{S{Pzz3)g*BV2^3b%M}9q(a%U#;D|#y|2P*$rf0h5g zorQoq*$}du?9s|@NE*Ri8Qij|#-RU=6I60M~n(a@PfqhEhGo3D} zY=_&W?go%ej#TI@V-1ErnU}Br3eLnek+OokYl%x>=g4+V#t!EkqZ($k;E^ zwl95JcIydu4I}&HdY7rzIe+@1GB1CW_Do-xq4uu(uXY|<%XqS~FoSj;fhcW$W!}3n z+Qyd)E9Ldw?cdF1Nw*NSNpqdm-)*=u3vTokYjd<|@v)hWJg2f`{9Mj__F_BF%4JM# zV8IlDiXwkr`z*CO4=AE^2&ZV;ed)~&wxI)dtc}7ogGf{3D|rWcmzp@-enm2=@Hkw~ ztCZJ?r&T@)pp(d^Mc}UfnrdHX;EbEVP)!o*6b-G{d#b}LvUEk;@F4+AI|ufFAw~0w zr!NPFPOO8+`OgC>d9u9kaknT4hZ){RM#W8i60UZ!6BVH1;g_HmHoPrKa~*uKL$C8W zoX6Kk`EC_>Zbc?5N{g8tQz@^F^e=E$(672&Tp3i(iW+!jP>58y?_`#dp?T#<(BB{( z6P7ml1u@&ZMaPfvX4RVwD0fyc2A3N~=}H&WvZAzS$~qaLj1Hic#)oIDlv;vzGkf#S z`8xMG>Wz{p9+rg}qG+lLzyoKT!wm04cU^Tpqsafdor79a{7j3_?k)bbwUqV;#t~YE zjxSxO^6vgb-F+D`({HI#bootXPy{RL{)}*^6r7uOCXOtolL!i7>UE~u{912VN~PD{ z#l#OWduz)M6QSBG0^yDQvfATJTtfwOq(f-Ss-k%`{F-y9B7bphX)V~QQHdq#dYD%i=1c-X zg|SFrRshvGOV(mHsvV{nstAeH~bDg*>jO=|}TEqK` ziB&OslB1MCU-LWSyD#g_Udp85QUR-?;Sdxglr@n4nboEp)S2-*?U>Mnok)a!T4X|4 z_d}J#+q~9np2QmjM%v_hnOJssw^h9OXDhcaG2HFE_I$@-zH0IstXmNc{{ zJo}LiQ0GK8sx{MB?c${=@+OOb>VlEh3_y!^G@Qp-auj(WQMu===1rT1)Yw^3Kt=4Estb!p!t11{HgPN0`g;tfDH$cEc2_y!T zS9^z6L}`!c>$|V4EM5^Y(U5B(5eu>~Bc08Q@^<9D_FRtKc8{?_VTtq=@9!eP%8w7v z!sx-UsFFfkXQ)qQetR=2Q7~`(`6yY(_pFl3N6#J3d*stl#Sj06^Rg%d1{E4cX;TR% z5`h!9!~q_0@HSQ}mY+k~IVq9-Oqj-(@B1%+OM-U6`^_)LfVN*P|s~4hFh7p z7(i^{0?QrawUYH}mwj0Y5u@vsuH4jp-#f{EleelW8b>eDM;K}&31tEyq4?|u!;jTV z--ZKCjqc)c9@UnO25PDI0_kcXLYPg+#a>&?r$l4n!aK4y!0PiRS|A{hJ7lE`& z#{q2uk_{;h?F=lH8rh?YKFbu`zGSRiXRgdzD6I@Msnjq^Z{&t~bQiC)4YUkLJS95@ z+?~Q+*lq>LxH2LeAVQTL2uI(z5^sBfQ$$(A=p!|0$Af?#0u?C6%NR~GbAAlJnJ|%_ z+Zif{D}|h9q$4(U-KyxX$JDoOZ84M1wNa^$2?ibgrc%wmm&TcebkvRZ4hTg)csX1{ zLHySAuksA@mU6;~iWpQkSKhp%R^EI#o5vD2b(vmqEX9L|nvw_TUmO+Jnc08{XN5ae zmvaoGw4~!Qr)c@|q$)H-nzG=u{D}efJj8r`ov}PRQ93E?#r(!oL>IyX*)LYrRy%@& z0Xo5Jf29!mc)ePn#8_k++y#(cTkXM39rHD+syxY~l`bCkKJ~`VFNaKklDEHUI4W* zRsaYG^!D@Pr?Mv1-QB3K4xkw5SEaWJa&mc+vJE~~m~7jp~80E)rzlQ!d7wL06g>8#UrS-~_lv`j3ev`T_nX 
z`E@+HTv+>dWohF~zm@gJOdl5$A*pC>Lxq(BN5w0jL zy@wPdrl3_Ch~aJ@hZ5J`*#`pb_is%sh!^sk2umsR2J`jVPI1h|KrhanT=6GLk>m|{ znB>jvxPMzh#H894B;rRAh~(O5BR701^%%s6f;)Bw-tDHTR%>~d9Rt=P(re*!)j`IT zL$~E@@<8(9^IUZGH_B%;HEW{ z8CzwmF3xYmvq6?_=nFYUgJ0X;;GByIJFG6`v@qw-o`&r?AYM2^KNb8o?I zy4kW4@%!T;sk~#<2q;h?)}8hBlxlYt^?nw0)e5&WH=wvcltwJS&>(D4a^^E{rOKkj6gwRM_)WV0arw7 zs~>exTPElz7oI_(;RVnOh73q``4n-VZ3f3&nQcZ^>bHX!R+nhAEP+f?Q7pfI1CDg2(YhUwe% z>wM2e@`q^{-YTAa=`|t7Y-A=JYh7U~M5f(TLFMkgd0j+TLvAjpnTSC-m{=OKrw2}F6uI8?Qd)@?Yk8-8Hco^^ zayB3(Neh5mt%XP<%p@r;Nh`D}Ze7Jq9mtZ`2jj2Zh6}^9cV0);<_Zrg&MK635*Z<4 zc@v5HYy-fqayfMY$h~4E1CCOygt{_5iT;|Ai}m_FbU+bLFk4w?tRYPZ#lDbB5NDk^wS^hsJZZ`r(<;$lyG<7u0?$H@It8ta!wbN3q{)nPC%A@| zJsxL0Tk1d^DPSGM0Lum{3)Vx!5+TOGG)}8Fj#S$vU$e89S0wgRi zhMpbD3G#u>DA^YaAW!fU!zfizj7co*dlw!LY7ulu={0|ymu8d}E$`aJ7*G(gGk3Z< zzze9ov#+vXsjfac4#I&Pn=U2Pq@mh#R%XPAoEyKlYh~AF2;sE~c6+!lKAV)biByXO zlZziYk!im*m2z@NI0$QUY%=DcGnvTV%4;BFGN;&8U{xSE48IP}|6S-JN`!s5;DQ87 zgCm#QytbWnk!zD)+f6gl^-kWIc(Qc}#xE8C<$&#jCz-mHMOKv1-S0fbHB7K5bYOyn4!RWMu}{qo0|2mmN6 zAcGiSQTahXwvtZz^SZL@RN1(U=uu6um-rclpaOPSZB_wz z%uxie#we|ygW%Cw?LZCzJe_T0I6%Uj{8Lt>-Kt%dL`8I~KRmj1N0Q(Rpo{aLer8m2 zMi}&7yXhKX!1L1nNSY$xo+a|!4=2l{u8LzF{GN$c>K*IWS|BF8$UW<)ctHzpaoZ{Cr0ZeWXn$y(={n5{e7i=(eOGuiJTEj3y_8U` z+(GTC?nELvs}i#a{Z@MfxH+(x_V&^5FEuXPKk9l^I3 zC_qbg&0nJgSCS+{GqKo<6{aw<_f!?yA|hD^-4e`UjUNF15wzSVVhnI=@=&j(5JLVG zlb)MD&I5v0xgFCHx1-v(=bi1EoXDM*_BQ06vJZLFwJ04`BUe4O4B#zyZ zHp&2A70V6*n&S>Q<7tXdCb|wHQ~;M4&ikZfF_hs+KwP9C%`|qhvr#PSIw>d)1rG#i zE!_mzkMSOW7Tms2sU$RxpdxajTX|tX@lLkmHB|*vyiy0|Ogy@ahZ2;Cb__h;gDkX# ztpkLJUOFFfShgq(DAomRl0LM{SZ<qO~Ze$sd zOS8wb(HSAA1QJ056$=Sin1M2I-_T`)MlC)nEplWkvw@4C5z}xYfi5Ik$sDt4g)g>w zaIcC!pCGa_yQo!kujYme5Bu5$htb5NvDFWcrK>;AkiO-;X8r4rZ}DC@D}oLQ*j;Om zop<=5?)##GyFEV~_{ev_Rjr`Fx`0%`4&d4ynu-LinV3EAYFAVue_q-+RJ+alGP*ir zUG<<2>a$e~pMH~@Ua1LisX$eB8yD}0(mLj z%Ak8vRnWjSGdOF@Hq`RR4}m2)P0aZVy&AAdi2|d`Rkzo@K*Pm9PR4bMj2oBKRZ@T*1 zL>(3R*8T<6x}g;b2P~bpk zlSu+d5l`lMY8nEKw!8f=Tk8VXklm1#d;os1rWSB6pd8f(6+i{OV4WCb5;+UH12AKl 
z0kzkki^b{8h$nDLD}n^kIuY`&dXn&H9i~h`JMgG{=&m9j7G>=!sMH}w^flIcZ@AL{ zf4C21xgf4vQr+*Dhy}-{MFF)31Mk`&n``u1k|qW(pkhVU_m>!*p_yw?2aUCkZue`P zebL#J7105mcS-P#^(tQh8OCidECeY{7GpKvh6$YjOG`YC)e!!v3<8!7Ovz888 z+alNA5AC3mDS^o$RK9A>cUX%#^~{U9AcZeoffVoPLME07m=t5RA^^R#-uM*}7mm$j zB|+Lunph@1Fz_h=iGZ;9`a6!fWq&8tSz$Y#>_T`*-%~~YQc&9q6VLS5!GXOtSNmvr zQMmw)_K2xX^5b6p&HadgGnK87(^Qxw9&to!i6v5nj>wPO(GDhJT0Q^#mbw{AFfm|- zH<$|?EN+POsTwfDZ5Dh+%bq}1UF>l+6wIgE-Sm)zU=-CMUI_Aog^8F~&qv)du1fzN z#vNJ?rckp1%~R~SD{%>K1%nANh{;^P4BR1DTEji{&S+I#0SHw_i7O_IWhz=z_3jK* zv2Y_dk{AvCc5YgBnu;h*S`bk|wI7(EAb<{eaMqQ9t_lNs2$>wvp@OliuA(v0zu8&A zy6W;fPD^wC(jkH3_F0;$6@VapwPuh5UN9!*v>+WZ9B5KBZvFjLoj!K5jVkhOP9F{E zj*MOlc~u0vb1};ddRBZeC4OQ5;Y4P_u_*lOfL&zdjn8{IypD4sj1Rx9`6X@0c3~0vW zD^5PaH3fq-P&)t!tZir9=~~$+dP`qYJ6gx^axtfcGvH1{$I0v${M4I$-f>AwL#=s+ zG^P`5BxdhXt7~O@7dmxsH!J=Pl?w3Ron(-v<>539_q;(X!01|lHE38}4NWFS4BSBz zvobPu~ZkG zm*~4p^^ZYb9$m(y2+;#iXYNd-0#d_=8_$7RcvTA2H%Ik9Aj}4*-uU~g1M>$?)9Z!L zeJ5Jg`i#KPh4;sM=SIuQ6&?~*d%{tKY)rX19(!S}JT%W?0&5m~HFy!zCZ3PoTYD|)WP6!H@giC zpUMTi#7aijuG8|B@|y}1h0f9d*2d_>3KH4ZS#zoqDX~+A+ zFtkcj20#Ir8wDcXY*#*{mWK8Vzefm>)H@;to(6pyGHq($2a%vYTCMys9@Ke(+*WXB108C3 zwQ;sOn_nN)N8tM%uH@~ssPHf&wq577ZLQ0-9LZTR=w#gZ7c$zwZ6{J#VuRq>>EX5? 
zfy)+$h5=xvQfVm$w3s1tbJsPm9hyvWpK85U28Pa^+~F(vU7$0sz*>SUb=2NN^FdD=xjhYcn=&p;P2#h~ZF(bF+waeo|=c6}z#WunFCS`s-zV4=i>FAt^$w#Hw{LsxK zdn2+Y9i3hQ_f!)_pG&)HGZp*ZUcMuWO;2YiH`Ci5dW0QFuA_!0U<_le-hr+fBA4Q= z^~u;Y9Q^XCnbEdqHG9k!8S$`W@|y%&b$A1Psy_)g$)Y1oJQg!#cu}*XRN7d#ZNEX$dbm zX{eT_-{o`89P%^ZF$QTS!7}CBYoWCT55I?3CdHnP&3NEW74FaSiIzCMA~E${+zgS(JXP`9=NgK z`(`q)hlbE4l({}x;Pu~3`A3(7v1H^J+KK!dA!b;n0=VpfJ3~Elt`$0?u{_v3j5ybp zupgupC7t62?!&MnD-2w!0-6LB3ALb%=K~aikT$F-+Nc=?G4&!iC>K?W8^S`C%(`5q zZ4z~V6>AUj7=9dXEcYZdtOKv$oQQN`5AII{qh|sups=K0EAEiC-lLtYxUh6O89qI-Hp6unmmUDFR3$d4zyJ; zBCg!qB+f=Krh}kw3Ks6pQ9irB`Qo*HFu28_j<5i3cF;&2+~G@{zDEr9-B>+pwh?O6 zCwK|7J~0aJ!4VJ3l$#`ymqdtgLEC=Hv^{&j2R~nOuy{^2Zw?R3l$sRi(s6wJ;kR81 zSFCO5=H4BnjkPy1;)W|3lli2Bh{|uYVrg{_Yz^P%n@A=k`ff`9D82hzD`=jlw)tXl zbi?w7XHKeWK0w!O&BfqfurGKzSOHT2&npS$gt>e7KP(3A^E6_m@EGm?FxCJtma;1= zR;8+WZz7)=qVq&Pg(H@!C`qKihqV%9)YuOk;v!My;%l@{&m_YWZZT8Crw=Oa)5@!Y zxeoh6ZKK?!dOzeZz1DZ;CUvB!0ixq31MN2Lbna|I1|H%U9)=b@SOQPI>86)fv6^Xg zox7&|VynRr<3#?{Gl)G&B85OO>+p00@CL~I_!?$H0_*_MUoSEW7QRLWdy5nRPP6~+ zOX_-%{TcoR$RGSZUV%M6Zc*4zjYr)yhEKRjRWa_3STaY7y7D>6qCa<#4MH~zLW%(` zNe})C&*qyyq*I0&`Zl4`k%PD;UzxUTPspXe0 zkw4bsl;e9?hhLtbhStL(qOhBOGS?FCa&iC(wR=g6SA%8s-Jjct92YJdYwY&KFEjQ7 zd(-Wq*X#MP$moY)q|*f&N81RU!A0n3;KuH23z?72>xO}5jhBo$R)q&+cnozWtxf12 z#X3BtUHWAg7{Vj1J()+^i}1+%XqU3}IE`yaW!yFY_-T(!eX&s)f0G@V5 z~`D)GA^psZk1^sr3X@KjYUqkfypY#2(9;u|%Epc@OJrT`wI0?&2` zNL2n(A~r_tzTaYB>23qdy)M}9z&$LR!e)ux^;fQH1=~89t_z$9MzofLA5KDZ7cT+lB4zys~RamrB}qJpEBVz>rXzPU1#+4Kb2wP!!{v+Kyd+6c3Zop+@8231Zm zMSrKwvp~C@Kt480Mw&w0`WG_o&Fg&KUJ%eaEdUvoIMZ7dg6;Q%5{GB-zN5mgKwPi{ zdOYr>Xc%v>sc6Bq4^}!hURg=zIi`7WPR=R!~|y~6NPSK1P$CD-Bu>BrF0>5dWyF8GXcUs`%{15_g z-ZGDpK5?p1Egiam+Hh3#Y~GQtpPHyL|BAU`lPL1+fS_8bkS=0Ju<(dY9=a`k@scUF zDPXhY(e>mwaJF_sFw$u#?-ko(gN}b^9lL4RbS2r`qH^#40ag!!X|jr>79+vLGmAYdUx5COpU8=1A$9za&l>5LbCNmkOp z$Mhs=l1I7OLwF_iBBpSdB7Vti(!o7HY!iVPNOd>Hfqbwje*ub6wlcv^UncK@DKum1ss6QptPjxr`qt5SNnQ=w$|s8=|_5G!)WvEwHx!*A;1CR^wP!g*q2Ri8qjCu@f&)59_Zk=+#-uw7D(-PCXl@swpaut?;wWH 
z=h=>D?qWS8bDJ!9xszz=kUVB;Af&O^^U$BLO^%1QpHgf;!QDEM zJa=Z^cjle>^2=1KaqtVx8P$eo4vXM9#52Y@iJIo>kDh7+{VBJ2nhVCDcKbJ{6CLK# zj5?SIcxI6_(fRUr5roJCDG}k^y`U>}$#Lr8@+}XFM!{Iw(D8hM?2{l5HI*-grCq}6 z0|*-gEiEu3S&C!~8zlgzTwvuVo1`2XrwSFz9;J~zw`godrS?6P%)AY;J@F)xr>s)kbu92&Zw0l!S3!1^U6}2~m z+Mtvy_%X#{Mjdz8dZ!P`8CC3^;uOzR-cCf*#HcE9oiWIr{-Q_p;ff*=Qo9DnnUZ|K z4X^`V76?pkMw|)6!%2B<#9`%42AEy&Hx@wC|1Qx2M_>ixWayN--n@M^dV29=&<-Y) z$&W13dxP5^Oh)KwXP4{SV6|%J3lEgvZb#F|NbDG&1HONX2~HRnJg|0;sjSTf=51|9 z%p1ip`7dz-!Yr9!rz=8*9-^Hps5MN^5K1o+m^-7tIuIW9XkqGMk)Mx$v(b2IRWbft` z!!crDFa|sSYbPDIjbtMs6*gKN8{nAE^bl{s44brF#}*U8^0gWM{y|9ul>fznuFhX* z$^G{cZ(@Ax(<>K=E|W_NQiO#5R%B1-Q5{%}Z=}f3^aY5aZM^!?{*+Vq2Bf#&2_qds z*FvbQLM#L-hZ5Xc2h^dLX+Xu5Ee=suM2Z z%L3J!l#}JTkJ~@x8>Z`SOY;$Gd|T3khL69r@PzDbP|7&rQlI8b48p6BI73>~nF?hm zq7_7UZmo559->`%64}rJPI5Otr(-!d@0BEx+S96W&6C2_>HFoAR=au~_ols-*TK3y z>5a4XV`@KORUoCieg`s7oF4>&Zp^&X-rheG56HN$Pg*>34bB%a&A$K>?v*v_H5Lw| zxu6Z^3tjl)eb3uIbj^68Orph2z9pBuZg;=d9X)*QD*HFf6EI`v`PkR>n42T_r33|J zJ-00+mG!NZE$!Z_Ea@@i8fqoSS6{rkguLiu6HDilmJ=_OfFH(L-Ykh|?!P`#V|PT@O&m!54)k=?{MFsD+Ljy|L);AT9ElGwqGOp9T>WsPINc?gD_p;OJVxxd0+C9 z1FJ%%o5lTNO!^DXx$0J;D$*p$!jjYBf66aC+%pz9l*FPsc1^rfk&Mwu%cW?V3)ZD`yl9N|W`oUa+i$|PZzQlNHjr+4g`0Vw8-+B0VD@AB zC$A4L7joUH5rAOxP$GH7ynw%TF+%8zcpdl0E9GUatxIPBR3PwQUh*2R0Waug-JnS7 zzm5}lu{XRfn~UGN2M9Fk00RB56!OYHY}&y#%1=3Nkbm9JFxG-VYydzR@R|bERuQ+> F`UjBQYb*c& literal 0 HcmV?d00001 diff --git a/gini.py b/gini.py new file mode 100644 index 0000000..ccfe822 --- /dev/null +++ b/gini.py @@ -0,0 +1,92 @@ +import pandas as pd +def test_split(index, value, dataset): + left, right = list(), list() + for row in dataset: + if row[index] < value: + left.append(row) + else: + right.append(row) + return left, right + +# Calculate the Gini index for a split dataset +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # 
avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# Select the best split point for a dataset +def get_split(dataset): + class_values = list(set(row[-1] for row in dataset)) + b_index, b_value, b_score, b_groups = 999, 999, 999, None + for index in range(len(dataset[0])-1): + for row in dataset: + groups = test_split(index, row[index], dataset) + gini = gini_index(groups, class_values) + if gini < b_score: + b_index, b_value, b_score, b_groups = index, row[index], gini, groups + return {'index':b_index, 'value':b_value, 'groups':b_groups} + +# Create a terminal node value +def to_terminal(group): + outcomes = [row[-1] for row in group] + return max(set(outcomes), key=outcomes.count) + +# Create child splits for a node or make terminal +def split(node, max_depth, min_size, depth): + left, right = node['groups'] + del(node['groups']) + # check for a no split + if not left or not right: + node['left'] = node['right'] = to_terminal(left + right) + return + # check for max depth + if depth >= max_depth: + node['left'], node['right'] = to_terminal(left), to_terminal(right) + return + # process left child + if len(left) <= min_size: + node['left'] = to_terminal(left) + else: + node['left'] = get_split(left) + split(node['left'], max_depth, min_size, depth+1) + # process right child + if len(right) <= min_size: + node['right'] = to_terminal(right) + else: + node['right'] = get_split(right) + split(node['right'], max_depth, min_size, depth+1) + +# Build a decision tree +def build_tree(train, max_depth, min_size): + root = get_split(train) + split(root, max_depth, min_size, 1) + return root + +# Print a decision tree +def print_tree(node, depth=0): + if isinstance(node, dict): + print('%s[X%d < 
%.3f]' % ((depth*' ', (node['index']+1), node['value']))) + print_tree(node['left'], depth+1) + print_tree(node['right'], depth+1) + else: + print('%s[%s]' % ((depth*' ', node))) + + +if __name__ == "__main__": + dataset = pd.read_excel('Train.xlsx', sheet_name="Sheet1").to_numpy() + tree = build_tree(dataset, 1, 1) + print_tree(tree) \ No newline at end of file