|
| 1 | +import numpy as np |
| 2 | +import pandas as pd |
| 3 | +#import math |
| 4 | +#import random |
| 5 | +#from numpy.linalg import * |
| 6 | +from sklearn.impute import SimpleImputer |
| 7 | +from calculate_glass_binary_bayesian import calculate_accuracy, calculate_accuracy_naive |
| 8 | +#from sklearn.preprocessing import Binarizer |
| 9 | +#from sklearn import datasets |
| 10 | + |
################################################################################################################################################
# Load data: the generated binary dataset is stored one sample per column in
# the spreadsheet, so transpose to the conventional (samples x features) shape.
data = pd.read_excel("data/generated_binary_data.xlsx", index_col=None, header=None)
data = data.transpose()

print("Original data shape:", data.shape)
# Silence divide-by-zero / invalid-value warnings produced by downstream
# log/probability arithmetic (all='ignore' already covers divide='ignore').
np.seterr(all='ignore')

################################################################################################################################################
# Missing-value cleaning: normalize the common missing-value spellings to NaN,
# then impute the columns that still contain NaN.
missing_values = ["n/a", "na", "--", "?", " ", "NA"]
data = data.replace(missing_values, np.nan)
feat_miss = data.columns[data.isnull().any()]
if feat_miss.size == 0:
    print("Data is clean")
else:
    print("Missing data shape before:", feat_miss.shape)
    # BUG FIX: the original passed strategy='calculate_iris', which is not a
    # valid SimpleImputer strategy and raises ValueError at construction/fit.
    # 'most_frequent' is used because the features are binary/categorical.
    # The deprecated 'verbose' keyword is dropped.
    imputer = SimpleImputer(copy=True, fill_value=None, missing_values=np.nan, strategy='most_frequent')
    data[feat_miss] = imputer.fit_transform(data[feat_miss])
    feat_miss = data.columns[data.isnull().any()]
    print("Missing data shape after:", feat_miss.shape)
| 42 | + |
| 43 | +################################################################################################################################################ |
| 44 | +cols = ['RI', 'Sodium', 'Magnesium', 'Aluminum', 'Silicon', 'Potassium', 'Calcium', 'Barium', 'Iron','class'] |
| 45 | +#binarization |
| 46 | +def get_binary(dataset): |
| 47 | + |
| 48 | + features = dataset.shape[1] |
| 49 | + #print (features) |
| 50 | + dataset = dataset.iloc[:,:] |
| 51 | + datacol = np.array(dataset.iloc[1:,-1]) |
| 52 | + datacol = np.reshape(datacol,(datacol.shape[0],1)) |
| 53 | + dataset = dataset.iloc[1:,:features-1].values |
| 54 | + #dataset.astype(float) |
| 55 | + ''' |
| 56 | + meanValue = np.reshape(np.mean(dataset,axis=0), (1, features-1)) |
| 57 | + #print (meanValue) |
| 58 | + |
| 59 | + dataset[dataset < meanValue] = 0.0 |
| 60 | + dataset[dataset > meanValue] = 1.0 |
| 61 | + ''' |
| 62 | + #print (dataset) |
| 63 | + |
| 64 | + dataset = np.reshape(dataset,(dataset.shape[0], dataset.shape[1])) |
| 65 | + |
| 66 | + dataset = np.concatenate((dataset, datacol), axis = 1) |
| 67 | + #print (datacol.shape, dataset.shape) |
| 68 | + np.random.shuffle(dataset) |
| 69 | + dataset = pd.DataFrame(dataset, columns = cols) |
| 70 | + return dataset |
| 71 | + |
data = data.iloc[:, 1:].values  # drop the original index column
data = pd.DataFrame(data, columns=cols)
data = get_binary(data)  # no real binarization happens here (see get_binary)
# BUG FIX: the original printed the entire DataFrame here while the message
# promised a shape; print the shape instead.
print("Shape after shuffling", data.shape)

################################################################################################################################################
# Remove header/index and convert to a plain numpy array.
data = data.iloc[:, :].values
print("Cleaned data shape:", data.shape)

################################################################################################################################################
# Feature/sample counts: every column except the last is a feature;
# the last column is the predicted class.
features = np.size(data, 1) - 1
samples = np.size(data, 0)

################################################################################################################################################
# Distinct class labels, sorted, reshaped to a (k, 1) column vector.
totalClass = data[:, features]
totalClass = np.sort(np.unique(np.array(totalClass)))
totalClass = np.reshape(totalClass, [totalClass.size, 1])
print("classes: ", totalClass)

################################################################################################################################################
fold = 5
foldSize = samples // fold  # truncating division, same as the original int(float(...)) dance
print("fold Size: ", foldSize)

################################################################################################################################################
# Samples that do not divide evenly into the folds are held out as pure test data.
data_test = data[:(samples - fold * foldSize), :]

################################################################################################################################################
splitArray = np.split(data[:(fold * foldSize), :], fold)
print("Each split size: ", splitArray[0].shape)
print("Total split: ", len(splitArray))
| 109 | + |
################################################################################################################################################
# Cross-validation for the optimal Bayesian classifier.
print("For fold starts for optimal bayesian")
accuracy = []
# NOTE(review): this iterates fold-1 times, so the last split is never used as
# the validation fold -- confirm whether range(fold) was intended.
for i in range(fold - 1):

    test_idx = splitArray[i]
    # BUG FIX: the original appended splitArray[i] (the validation fold
    # itself) on every pass of the inner loop, so the model was trained on
    # four copies of its own validation data.  Append splitArray[j] instead.
    training_idx = [splitArray[j] for j in range(len(splitArray)) if j != i]
    training_idx = np.concatenate(training_idx, axis=0)

    data_train, data_cv_test = training_idx, test_idx

    print("For fold starts: ", (i + 1))
    accuracyVal = calculate_accuracy(data_train, features, data_cv_test, totalClass, foldSize)
    accuracy.append(accuracyVal)
    print("For fold ends ")
| 134 | + |
################################################################################################################################################
# Cross-validation for the naive Bayesian classifier.
print("For fold starts for naive bayesian")
accuracy2 = []
# NOTE(review): iterates fold-1 times, like the optimal-Bayes loop above --
# confirm whether range(fold) was intended.
for i in range(fold - 1):

    test_idx = splitArray[i]
    # BUG FIX: the original appended splitArray[i] (the validation fold
    # itself) on every pass of the inner loop, training on copies of the
    # validation data.  Append splitArray[j] instead.
    training_idx = [splitArray[j] for j in range(len(splitArray)) if j != i]
    training_idx = np.concatenate(training_idx, axis=0)

    data_train, data_cv_test = training_idx, test_idx

    print("For fold starts: ", (i + 1))
    accuracyVal = calculate_accuracy_naive(data_train, features, data_cv_test, totalClass, foldSize)
    accuracy2.append(accuracyVal)
    print("For fold ends ")
| 159 | + |
################################################################################################################################################
# Report the mean cross-validation accuracy of each classifier.
mean_bayes = sum(accuracy) / len(accuracy)
print("Average Cross Validation Accuracy for bayesian: ", mean_bayes)

################################################################################################################################################
mean_naive = sum(accuracy2) / len(accuracy2)
print("Average Cross Validation Accuracy for naive bayes: ", mean_naive)