Skip to content

Commit c8111cf

Browse files
committed
commit_ayan_23_01_2020
1 parent 304f72f commit c8111cf

1 file changed

Lines changed: 164 additions & 0 deletions

File tree

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
import numpy as np
2+
import pandas as pd
3+
#import math
4+
#import random
5+
#from numpy.linalg import *
6+
from sklearn.impute import SimpleImputer
7+
from calculate_glass_binary_bayesian import calculate_accuracy, calculate_accuracy_naive
8+
#from sklearn.preprocessing import Binarizer
9+
#from sklearn import datasets
10+
11+
################################################################################################################################################
# Load the generated binary dataset. The sheet stores samples as columns,
# so transpose to get one sample per row.
data = pd.read_excel("data/generated_binary_data.xlsx", index_col=None, header=None)
data = data.transpose()

print ("Original data shape:", data.shape)
# Silence divide-by-zero / invalid-value warnings raised by later
# probability/log computations in the classifier code.
np.seterr(divide = 'ignore')
np.seterr(all = 'ignore')

################################################################################################################################################
# Missing-value cleaning: normalize the common "missing" tokens to NaN,
# then mean-impute any columns that still contain NaN.
missing_values = ["n/a", "na", "--", "?", " ", "NA"]
data = data.replace(missing_values, np.nan)
feat_miss = data.columns[data.isnull().any()]
if feat_miss.size == 0:
    print ("Data is clean")
else:
    print ("Missing data shape before:", feat_miss.shape)
    # BUG FIX: 'calculate_iris' is not a valid SimpleImputer strategy
    # (valid: 'mean', 'median', 'most_frequent', 'constant') and raises
    # ValueError at construction/fit time; use mean imputation.
    # The deprecated `verbose` kwarg is also dropped for compatibility
    # with current scikit-learn.
    imputer = SimpleImputer(copy=True, missing_values=np.nan, strategy='mean')
    data[feat_miss] = imputer.fit_transform(data[feat_miss])
    feat_miss = data.columns[data.isnull().any()]
    print ("Missing data shape after:", feat_miss.shape)
43+
################################################################################################################################################
# Column labels applied to the shuffled dataset (last column is the class).
cols = ['RI', 'Sodium', 'Magnesium', 'Aluminum', 'Silicon', 'Potassium', 'Calcium', 'Barium', 'Iron', 'class']


def get_binary(dataset):
    """Split off the class column, drop the first row, shuffle the samples,
    and return the result as a DataFrame labelled with ``cols``.

    NOTE(review): despite the name, no binarization happens here — the mean
    thresholding code was disabled upstream, so this is a reshape + shuffle.
    Row 0 is skipped (presumably a header artifact — confirm against the
    generated spreadsheet).
    """
    n_cols = dataset.shape[1]
    # Last column holds the class label, as a (n, 1) column vector.
    labels = np.array(dataset.iloc[1:, -1]).reshape(-1, 1)
    # All remaining columns are features.
    feats = dataset.iloc[1:, :n_cols - 1].values
    # Re-attach the labels and shuffle the samples row-wise in place.
    combined = np.concatenate((feats, labels), axis=1)
    np.random.shuffle(combined)
    return pd.DataFrame(combined, columns=cols)
71+
72+
# Drop the first column (index artifact from the spreadsheet), relabel the
# columns, and shuffle the rows via get_binary (no real binarization occurs).
data = data.iloc[:, 1:].values
data = pd.DataFrame(data, columns=cols)
data = get_binary(data)
# BUG FIX: the message announces a shape, but the original printed the whole
# DataFrame; print data.shape instead.
print ("Shape after shuffling", data.shape)

################################################################################################################################################
# Strip header and index: work with a plain numpy array from here on.
data = data.iloc[:, :].values
print ("Cleaned data shape:", data.shape)
82+
83+
################################################################################################################################################
# Training setup: every column except the last is a feature; the last
# column carries the predicted class.
features = data.shape[1] - 1   # number of feature columns
samples = data.shape[0]        # number of rows / samples

################################################################################################################################################
# Enumerate the distinct class labels as a sorted (k, 1) column vector.
totalClass = np.sort(np.unique(data[:, features]))
totalClass = totalClass.reshape(totalClass.size, 1)
print ("classes: ", totalClass)
95+
96+
################################################################################################################################################
fold = 5
# Samples per fold, truncated; the remainder rows are set aside below.
foldSize = samples // fold
print ("fold Size: ", foldSize)

################################################################################################################################################
# Pure test data: the rows left over after filling the folds.
# NOTE(review): these are the FIRST remainder rows, which also appear inside
# the folds below — possibly the tail remainder was intended; confirm.
data_test = data[:(samples - fold * foldSize), :]

################################################################################################################################################
# Partition the first fold*foldSize rows into `fold` equal splits.
splitArray = np.split(data[:(fold * foldSize), :], fold)
print ("Each split size: ", splitArray[0].shape)
print ("Total split: ", len(splitArray))
109+
110+
################################################################################################################################################
# Cross-validation loop for the optimal bayesian classifier: fold i is the
# held-out split, all other splits form the training set.
print ("For fold starts for optimal bayesian")
accuracy = []
# BUG FIX: iterate over every fold — range(fold-1) silently skipped the
# last fold of the cross-validation.
for i in range(fold):
    test_idx = splitArray[i]
    # BUG FIX: gather the OTHER splits (splitArray[j]) for training; the
    # original appended splitArray[i] — the test fold — on every pass,
    # so the model trained on fold-1 copies of its own test data.
    training_idx = [splitArray[j] for j in range(len(splitArray)) if j != i]
    training_idx = np.concatenate(training_idx, axis=0)

    data_train, data_cv_test = training_idx, test_idx
    print ("For fold starts: ", (i+1))
    accuracyVal = calculate_accuracy(data_train, features, data_cv_test, totalClass, foldSize)
    accuracy.append(accuracyVal)
    print ("For fold ends ")
134+
135+
################################################################################################################################################
# Cross-validation loop for the naive bayesian classifier: fold i is the
# held-out split, all other splits form the training set.
print ("For fold starts for naive bayesian")
accuracy2 = []
# BUG FIX: iterate over every fold — range(fold-1) silently skipped the
# last fold of the cross-validation.
for i in range(fold):
    test_idx = splitArray[i]
    # BUG FIX: gather the OTHER splits (splitArray[j]) for training; the
    # original appended splitArray[i] — the test fold — on every pass,
    # so the model trained on fold-1 copies of its own test data.
    training_idx = [splitArray[j] for j in range(len(splitArray)) if j != i]
    training_idx = np.concatenate(training_idx, axis=0)

    data_train, data_cv_test = training_idx, test_idx
    print ("For fold starts: ", (i+1))
    accuracyVal = calculate_accuracy_naive(data_train, features, data_cv_test, totalClass, foldSize)
    accuracy2.append(accuracyVal)
    print ("For fold ends ")
159+
160+
################################################################################################################################################
# Report the mean cross-validation accuracy for each classifier.
mean_bayes = sum(accuracy) / len(accuracy)
print ("Average Cross Validation Accuracy for bayesian: ", mean_bayes)

################################################################################################################################################
mean_naive = sum(accuracy2) / len(accuracy2)
print ("Average Cross Validation Accuracy for naive bayes: ", mean_naive)

0 commit comments

Comments
 (0)