#Please Note:
#Error converting from notebook to html i.e. running plots
#if plots don't show, manually include a local version of require.js. Get a copy here - https://requirejs.org/docs/download.html#requirejs
%%HTML
<script src="require.js"></script>


#*************************************************************#
                         #Import Libraries#
#*************************************************************#

import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd

# for visualization
import matplotlib.pyplot as plt, seaborn as sb
import plotly.express as px
import plotly.io as pio

import matplotlib.pyplot as plt

#for deep learning

#the ImageDataGenerator class in keras provides a suite of techniques for scaling pixel values in our image 
#dataset prior to modeling.
from tensorflow.keras.preprocessing.image import ImageDataGenerator

from tensorflow.keras import Sequential, Input
from tensorflow.keras.layers import Dense, Dropout,BatchNormalization
from tensorflow.keras import layers
from tensorflow import keras
import os
from PIL import Image
from tensorflow.keras.utils import to_categorical
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from keras.callbacks import EarlyStopping

#for evaluation
from sklearn.metrics import f1_score


#*************************************************************#
                         #Import Data#
#*************************************************************#

#directory to dataset in drive
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

dataset_path = '/content/gdrive/My Drive/NewFolder/Dataset/'

Mounted at /content/gdrive


#This cell unzips the dataset downoaded from kaggle. It needs to be executed only once. to unzip
#!unzip '/content/gdrive/My Drive/NewFolder/archive (1).zip' -d '/content/gdrive/My Drive/NewFolder/'


#Refer to reference 8
#split into train, test and validation sets,for evaluation purposes
#the seed lets us reproduce the splits and this library allows randomized oversampling for imbalanced datasets.

!pip install split-folders
import splitfolders
splitfolders.ratio(dataset_path, output="output", seed=1345, ratio=(.8, 0.1, 0.1))

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting split-folders
  Downloading split_folders-0.5.1-py3-none-any.whl (8.4 kB)
Installing collected packages: split-folders
Successfully installed split-folders-0.5.1

Copying files: 6400 files [01:53, 56.59 files/s]


#Refer to reference 9, for code

#the dataset is now split between a training, test and validation set after using 'splitfolders' library above
#5,119 images in training, 642 images in test and 639 images in validation. 

#use SEED wherever possible. this eliminates the noise from random initialization which makes our model more robust

#models trained with small batch sizes generalize well on the validation set.
#for our coursework, we are training our model with the batch size ranging from 8 to 2048 

#batch size controls the accuracy of the estimate of the error gradient when training neural networks
#it also affects training time, and compute resources 

#Refer to Reference 10, for background

IMG_HEIGHT = 128
IMG_WIDTH = 128
train_ds = tf.keras.preprocessing.image_dataset_from_directory(
"./output/train",
seed=42,
image_size=(IMG_HEIGHT, IMG_WIDTH),
batch_size=32
)

test_ds = tf.keras.preprocessing.image_dataset_from_directory(
"./output/test",
seed=42,
image_size=(IMG_HEIGHT, IMG_WIDTH),
batch_size=32
)

val_ds = tf.keras.preprocessing.image_dataset_from_directory(
"./output/val",
seed=42,
image_size=(IMG_HEIGHT, IMG_WIDTH),
batch_size=32
)

Found 5119 files belonging to 4 classes.
Found 642 files belonging to 4 classes.
Found 639 files belonging to 4 classes.


#print the 4 class labels

class_names = train_ds.class_names
print(class_names)
train_ds

['Mild_Demented', 'Moderate_Demented', 'Non_Demented', 'Very_Mild_Demented']

<BatchDataset element_spec=(TensorSpec(shape=(None, 128, 128, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>


#return elements in the given positional indices (not actual values), along an axis
train_ds.take(1)

<TakeDataset element_spec=(TensorSpec(shape=(None, 128, 128, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>


#*************************************************************#
                         #Visualize the Images#
#*************************************************************#

#Refer to reference 9, for code

#what we have below is a 3D tensor of 8-bit integers. More precisely, it’s an array of 6400 matrices of,
#128 × 128 integers. Each such matrix is a grayscale image, with coefficients between 0 and 255.
#here, we'll display 9 of these images

plt.figure(figsize=(10, 10))
for images, labels in train_ds.take(1):
    for i in range(9):
        ax = plt.subplot(3, 3, i + 1)
        plt.imshow(images[i].numpy().astype("uint8"))#data type
        plt.title(class_names[labels[i]])
        plt.axis("off")


#*************************************************************#
        #Visualize the Data - Check for Imbalance#
#*************************************************************#

#check how images are distributed in 4 categories i.e. are they balanced?
#as can be seen below, the data is very imbalanced, 'moderate demented' being the minority class

fig = plt.figure()
ax = fig.add_axes([0,0,1,1])
size = [896,64,3200,2240]# refer to section 3-Data above, for counts in each category
ax.bar(class_names,size)
plt.show

<function matplotlib.pyplot.show(*args, **kw)>


class_names

['Mild_Demented', 'Moderate_Demented', 'Non_Demented', 'Very_Mild_Demented']


#*************************************************************#
        #Address Imbalance with SMOTE#
#*************************************************************#

#feeding imbalanced data to our classifier can make it biased in favor of the majority class, simply because it 
#did not have enough data to learn about the minority classes. hence our high level of imbalance can potentially
#impact our model performamce. this is certainly true, if we don't have enough samples to train with,
#such as our case

#Synthetic Minority Oversampling Technique (SMOTE) is one of the most commonly used statistical techniques 
#(oversampling method) used, to solve imbalance problems.  it aims to balance class distribution by randomly
#increasing the 3 minority class instances above ('mild_demented', 'moderate_demented' and 'very_mild_demented'), 
#by replicating them. SMOTE synthesises new minority instances between existing minority instances

#first, convert images into numpy arrays in order to apply SMOTE 

#next, we load each image into a list, and save the corresponding label in another list
data_src=dataset_path
data=[]
y=[]
for d in os.listdir(data_src):
    for file in os.listdir(data_src+d):
        data.append(Image.open(data_src+d+'/'+file))
        y.append(d)


#convert each image into a numpy array

X=[]
for im in data:
    X.append(np.array(im))


#convert the list of data into an array

X=np.array(X)


#check shape of numpy array

X.shape

(6400, 128, 128)


#*************************************************************#
                         #Rescaling#
#*************************************************************#

#refer to Reference 9 for code

#neural networks have difficulty adapting to heterogeneous data. the data will have to be normalized or rescaled.
#for consistency, test data is normalized using the mean and standard deviation of the training set.

#so, rescaling can be used to rescale pixel values from the range of 0-255 (0 is black and 255 is white, or min/ max), 
#to the range 0-1 (normalization) - this is the preferred (small) range for neural network models. 
#we rescale 1./255 to transform every pixel value from range [0,255] -> [0,1] 

#all features should ideally take the values in roughly the same range because inhomogeneity can trigger large 
#gradient updates and hamper conversions i.e. poor learning

#Divide by 255 to standardize the values between 0 and 1

X=X/255


#let's count how many of each category we have

non=0
mild=0
mod=0
vm=0
for cat in y:
    if cat=="Mild_Demented":
        mild+=1
    elif cat=="Moderate_Demented":
        mod+=1
    elif cat=="Non_Demented":
        non+=1
    else:
        vm+=1
print("Non Demented: ",non)
print("Very Mild: ",vm)
print("Moderate: ",mod)
print("Mild :",mild)

Non Demented:  3200
Very Mild:  2240
Moderate:  64
Mild : 896


#now we change these values/ labels to numerical ones so that we can use to_categorical
#0 will be non, 1 will be very mild, 2 will be mild, 3 will be moderate

y_num=[]
for cat in y:
    if cat=="Mild_Demented":
        y_num.append(2)
    elif cat=="Moderate_Demented":
        y_num.append(3)
    elif cat=="Non_Demented":
        y_num.append(0)
    else:
        y_num.append(1)


#convert to a categorical target as expected by keras

y=to_categorical(y_num)


#flatten the images, because smote accepts structural data as input

X_reshaped=X.reshape((6400,128*128))


#now apply smote to transform the dataset

sm=SMOTE(random_state=42)
train_data, train_labels = sm.fit_resample(X_reshaped, y)


#we can now see the number of images have doubled 

print(train_data.shape)

(12800, 16384)


#visualize how smote oversampled each class

y_oversampled=np.argmax(train_labels,axis=1)


#count the numbers in each category

hist_results=np.histogram(y_oversampled,bins=[0,1,2,3,4])
hist_results[0]

array([3200, 3200, 3200, 3200])


#0 will be non, 1 will be very mild, 2 will be mild, 3 will be moderate

fig = plt.figure()
ax = fig.add_axes([0,0,1,1])
size = hist_results[0]# refer to section 3-Data above, for counts in each category
ax.bar(class_names,size)
plt.show

<function matplotlib.pyplot.show(*args, **kw)>


#reshape to use in a neural network by batches (as we have a new shape after SMOTE)

batch_size=1
train_data_batched=train_data.reshape(12800,16384,batch_size)


#after SMOTE we need to split the dataset into training, validation and test sets

X_train, X_test, y_train, y_test = train_test_split(train_data_batched, train_labels, test_size=0.2, random_state=1)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2


#baseline - building the network
##the relu function returns the maximum of the input and 0 i.e. ReLU sets all negative values in the matrix/vector to zero and all other values are kept constant

model = Sequential([#create empty network object. sequential means the network is a series of transformational layers 
  layers.Flatten(),
  
  layers.Dense(64, activation='relu'),#each neural layer performs the computation/ activation, output = relu(dot(w, input) + b), mapping rank input tensors to rank output tensors. the second layer outputs a tensor with 64 elements. or, the representational space of this layer has 64 dimensions, the layer has 64 (hidden) units

  layers.Dense(32, activation='relu'),#the tensors are w, the tensor of weight parameters,b, the tensor of bias parameters, and x, the input tensor.The tensors are dotted, added,and a function f is applied. this layer performs a geometric transformation, resulting in a point in 32 dimensional space. this layer transforms an input tensor into a 1D tensor of length 32 - equivalent to a 32 dimeensional vector
  layers.Dense(4, activation='softmax')#softmax outputs a vector whose elements form a probability distribution (non negative floats that sum to one). outputs are interpreted as probabilities of membership of each class: the probability that the input sample is labeled '1/ mild demented', or '2/ moderate demented' and so on. outputs a 4 dimensional vector 
])


#baseline - compiling the model

model.compile(optimizer='rmsprop', #the optimizer (all sgd variants) makes parameter adjustments (layers, weights and biases) in the training loop and metrics reports on progress
              loss='categorical_crossentropy',#how far off the network prediction is from the target/ quantifies performance
              metrics=['accuracy'])#is the fraction of correctly classified samples

#'sparse' categorical crossentropy is used when classes are represented as integers 1,2,3,4 etc
#only categorical crossentropy is used when classes are one hot encoded i.e. 0100, 0001 and so forth.

#we have used categorical cross entropy due to utilising one hot encoded labels .this has no effect on model 
#performance. model performance will be the same in both cases if data is represented as integers or one hot 
#encoded.


#baseline - training the network

history = model.fit(X_train,y_train,validation_data=(X_val,y_val),epochs=4, verbose=1)

Epoch 1/4
240/240 [==============================] - 5s 6ms/step - loss: 1.7196 - accuracy: 0.3598 - val_loss: 1.1349 - val_accuracy: 0.5055
Epoch 2/4
240/240 [==============================] - 1s 4ms/step - loss: 0.9562 - accuracy: 0.5599 - val_loss: 0.9351 - val_accuracy: 0.5285
Epoch 3/4
240/240 [==============================] - 1s 4ms/step - loss: 0.8021 - accuracy: 0.6172 - val_loss: 0.6573 - val_accuracy: 0.6906
Epoch 4/4
240/240 [==============================] - 1s 4ms/step - loss: 0.7274 - accuracy: 0.6514 - val_loss: 0.7982 - val_accuracy: 0.6301


#*************************************************************#
                         #Building the Network#
#*************************************************************#

#in order to create a model which overfits, we require to have some reference model. hence, we can use the model 
#from reference [16]

#this references an MLP network, and is used for MNIST digits, size 784 x 128 x 64 x 10 = 64,225,280 
#we have taken a much larger network 16384 x 1024 x 512 x 128 x 4 = 4,398,046,511,104, which is 68,478 larger than
#the MNIST digits network. hence, more than capable of overfitting


model = Sequential([
  layers.Flatten(),
  layers.Dense(1024, activation='relu'),
  layers.Dense(512, activation='relu'),

  layers.Dense(128, activation='relu'),
  layers.Dense(4, activation='softmax')
])

#in summary, the 'output' layer for multi-classification tasks (such as this one), should have as many units as 
#there are classes i.e 4. use a softmax activation function for single-label multi-classification
#beware of creating information bottlenecks by defining small intermediate layers

#informtion dropped by a layer can never be recovered (could be a bottleneck). hence we require bigger layers 
#(more units) hence, our first layer we use 1024 unit layers

#affine transformations alone are unsufficient for data uncrumpling. we require a non-linear part, the activation i.e. relu
#without activations,layers would only perform affine transformations

#the batch size affects some indicators such as overall training time, training time per epoch, quality of the 
#model, and similar. usually, we choose the batch size as a power of two, in the range between 16 and 512. 
#but generally, the size of 32 is a rule of thumb and a good initial choice. it's the vector length
#the hypothesis space defined by the depth of network and the size and activation of each layer is arbitrary.there 
#is no formula. We have to rely on rules of thumb, best practices.
#each layer applies a few simple tensor operations, conceptually, the geometric transformation.
#transformations are parameterized by weights and biases.

#softmax heightens the larger elements of a vector whilst ensuring that the vector elements sum to one
#and, softmax retains information of the smaller elements.


#*************************************************************#
                         #Compiling the Model#
#*************************************************************#

#The training specification is made at compilation
#The optimizer, loss function, and a monitoring metric are specified when the network is compiled.

model.compile(optimizer='adam', 
              loss='categorical_crossentropy',
              metrics=['accuracy'])

#categorical crossentropy is the preferred loss for problems in which each sample belongs to one of several classes
#many optimizers attempt to avoid the local optimum problem by employing momentum. tweaks layer parameters in a 
#backwards pass default learning value or tweak?


#*************************************************************#
                         #Training the Network#
#*************************************************************#

#train the model for 80 epochs with the Keras Model.fit method:

#there are accuracy and loss metrics for both the training and validation data sets. the value of the accuracy 
#is a simple percentage measure of how many items the network got right. the value of loss is the 
#cross entropy loss.

#an epoch is a complete iteration over the entire training set. the loss is minimized by mini-batch 
#stochastic gradient descent, also known as SGD. weights and biases are adjusted after each mini-batch has been 
#processed

history = model.fit(X_train,y_train,validation_data=(X_val,y_val),epochs=80, verbose=1)#callbacks=callback

Epoch 1/80
240/240 [==============================] - 2s 6ms/step - loss: 1.3693 - accuracy: 0.5337 - val_loss: 0.7301 - val_accuracy: 0.6012
Epoch 2/80
240/240 [==============================] - 1s 4ms/step - loss: 0.6562 - accuracy: 0.6823 - val_loss: 0.6494 - val_accuracy: 0.6641
Epoch 3/80
240/240 [==============================] - 1s 4ms/step - loss: 0.6283 - accuracy: 0.7146 - val_loss: 0.5558 - val_accuracy: 0.7230
Epoch 4/80
240/240 [==============================] - 1s 4ms/step - loss: 0.5918 - accuracy: 0.7340 - val_loss: 0.6446 - val_accuracy: 0.7000
Epoch 5/80
240/240 [==============================] - 1s 4ms/step - loss: 0.5051 - accuracy: 0.7710 - val_loss: 0.4408 - val_accuracy: 0.7914
Epoch 6/80
240/240 [==============================] - 1s 4ms/step - loss: 0.4498 - accuracy: 0.7923 - val_loss: 0.4755 - val_accuracy: 0.7828
Epoch 7/80
240/240 [==============================] - 1s 4ms/step - loss: 0.4001 - accuracy: 0.8199 - val_loss: 0.3782 - val_accuracy: 0.8223
Epoch 8/80
240/240 [==============================] - 1s 4ms/step - loss: 0.3560 - accuracy: 0.8443 - val_loss: 0.3375 - val_accuracy: 0.8512
Epoch 9/80
240/240 [==============================] - 1s 4ms/step - loss: 0.3071 - accuracy: 0.8658 - val_loss: 0.4166 - val_accuracy: 0.7949
Epoch 10/80
240/240 [==============================] - 1s 4ms/step - loss: 0.2722 - accuracy: 0.8849 - val_loss: 0.2732 - val_accuracy: 0.8754
Epoch 11/80
240/240 [==============================] - 1s 4ms/step - loss: 0.2574 - accuracy: 0.8938 - val_loss: 0.3295 - val_accuracy: 0.8672
Epoch 12/80
240/240 [==============================] - 1s 4ms/step - loss: 0.2278 - accuracy: 0.9096 - val_loss: 0.2104 - val_accuracy: 0.9059
Epoch 13/80
240/240 [==============================] - 1s 4ms/step - loss: 0.3246 - accuracy: 0.8520 - val_loss: 0.3694 - val_accuracy: 0.8473
Epoch 14/80
240/240 [==============================] - 1s 4ms/step - loss: 0.3016 - accuracy: 0.8642 - val_loss: 0.2516 - val_accuracy: 0.8902
Epoch 15/80
240/240 [==============================] - 1s 4ms/step - loss: 0.7400 - accuracy: 0.6573 - val_loss: 0.9419 - val_accuracy: 0.4902
Epoch 16/80
240/240 [==============================] - 1s 4ms/step - loss: 0.6459 - accuracy: 0.6992 - val_loss: 0.3430 - val_accuracy: 0.8379
Epoch 17/80
240/240 [==============================] - 1s 4ms/step - loss: 0.3509 - accuracy: 0.8411 - val_loss: 0.5969 - val_accuracy: 0.7430
Epoch 18/80
240/240 [==============================] - 1s 4ms/step - loss: 0.3057 - accuracy: 0.8671 - val_loss: 0.2867 - val_accuracy: 0.8625
Epoch 19/80
240/240 [==============================] - 1s 4ms/step - loss: 0.2831 - accuracy: 0.8766 - val_loss: 0.4983 - val_accuracy: 0.8059
Epoch 20/80
240/240 [==============================] - 1s 4ms/step - loss: 0.2335 - accuracy: 0.9017 - val_loss: 0.2455 - val_accuracy: 0.8914
Epoch 21/80
240/240 [==============================] - 1s 4ms/step - loss: 0.2464 - accuracy: 0.8969 - val_loss: 0.2553 - val_accuracy: 0.8875
Epoch 22/80
240/240 [==============================] - 1s 4ms/step - loss: 0.1952 - accuracy: 0.9210 - val_loss: 0.2820 - val_accuracy: 0.8828
Epoch 23/80
240/240 [==============================] - 1s 4ms/step - loss: 0.2185 - accuracy: 0.9125 - val_loss: 0.2502 - val_accuracy: 0.8914
Epoch 24/80
240/240 [==============================] - 1s 4ms/step - loss: 0.2392 - accuracy: 0.9079 - val_loss: 0.2215 - val_accuracy: 0.9055
Epoch 25/80
240/240 [==============================] - 1s 4ms/step - loss: 0.1697 - accuracy: 0.9340 - val_loss: 0.2006 - val_accuracy: 0.9137
Epoch 26/80
240/240 [==============================] - 1s 4ms/step - loss: 0.2291 - accuracy: 0.9155 - val_loss: 0.2342 - val_accuracy: 0.8977
Epoch 27/80
240/240 [==============================] - 1s 4ms/step - loss: 0.2132 - accuracy: 0.9184 - val_loss: 0.2212 - val_accuracy: 0.9066
Epoch 28/80
240/240 [==============================] - 1s 4ms/step - loss: 0.1519 - accuracy: 0.9411 - val_loss: 0.2699 - val_accuracy: 0.8965
Epoch 29/80
240/240 [==============================] - 1s 4ms/step - loss: 0.1802 - accuracy: 0.9294 - val_loss: 0.1874 - val_accuracy: 0.9227
Epoch 30/80
240/240 [==============================] - 1s 4ms/step - loss: 0.1632 - accuracy: 0.9352 - val_loss: 0.2567 - val_accuracy: 0.9027
Epoch 31/80
240/240 [==============================] - 1s 4ms/step - loss: 0.1799 - accuracy: 0.9311 - val_loss: 0.2264 - val_accuracy: 0.9156
Epoch 32/80
240/240 [==============================] - 1s 4ms/step - loss: 0.1436 - accuracy: 0.9445 - val_loss: 0.2431 - val_accuracy: 0.9027
Epoch 33/80
240/240 [==============================] - 1s 4ms/step - loss: 0.1738 - accuracy: 0.9365 - val_loss: 0.2003 - val_accuracy: 0.9215
Epoch 34/80
240/240 [==============================] - 1s 4ms/step - loss: 0.1523 - accuracy: 0.9413 - val_loss: 0.2075 - val_accuracy: 0.9168
Epoch 35/80
240/240 [==============================] - 1s 4ms/step - loss: 0.1167 - accuracy: 0.9568 - val_loss: 0.1730 - val_accuracy: 0.9340
Epoch 36/80
240/240 [==============================] - 1s 4ms/step - loss: 0.1178 - accuracy: 0.9546 - val_loss: 0.4761 - val_accuracy: 0.8316
Epoch 37/80
240/240 [==============================] - 1s 4ms/step - loss: 0.1320 - accuracy: 0.9533 - val_loss: 0.1867 - val_accuracy: 0.9316
Epoch 38/80
240/240 [==============================] - 1s 4ms/step - loss: 0.0879 - accuracy: 0.9694 - val_loss: 0.1607 - val_accuracy: 0.9359
Epoch 39/80
240/240 [==============================] - 1s 4ms/step - loss: 0.1262 - accuracy: 0.9534 - val_loss: 0.1507 - val_accuracy: 0.9414
Epoch 40/80
240/240 [==============================] - 1s 4ms/step - loss: 0.0988 - accuracy: 0.9634 - val_loss: 0.1663 - val_accuracy: 0.9414
Epoch 41/80
240/240 [==============================] - 1s 4ms/step - loss: 0.3772 - accuracy: 0.8522 - val_loss: 0.6384 - val_accuracy: 0.6477
Epoch 42/80
240/240 [==============================] - 1s 4ms/step - loss: 0.4641 - accuracy: 0.7396 - val_loss: 0.6545 - val_accuracy: 0.6672
Epoch 43/80
240/240 [==============================] - 1s 4ms/step - loss: 0.4951 - accuracy: 0.7382 - val_loss: 0.4311 - val_accuracy: 0.7559
Epoch 44/80
240/240 [==============================] - 1s 4ms/step - loss: 0.3971 - accuracy: 0.7741 - val_loss: 0.3910 - val_accuracy: 0.7781
Epoch 45/80
240/240 [==============================] - 1s 4ms/step - loss: 0.3784 - accuracy: 0.7841 - val_loss: 0.3938 - val_accuracy: 0.7758
Epoch 46/80
240/240 [==============================] - 1s 4ms/step - loss: 0.4462 - accuracy: 0.7767 - val_loss: 0.3784 - val_accuracy: 0.7980
Epoch 47/80
240/240 [==============================] - 1s 4ms/step - loss: 0.3830 - accuracy: 0.7861 - val_loss: 0.3694 - val_accuracy: 0.7937
Epoch 48/80
240/240 [==============================] - 1s 4ms/step - loss: 0.4540 - accuracy: 0.7736 - val_loss: 0.3864 - val_accuracy: 0.7855
Epoch 49/80
240/240 [==============================] - 1s 4ms/step - loss: 0.3618 - accuracy: 0.8000 - val_loss: 0.3599 - val_accuracy: 0.7973
Epoch 50/80
240/240 [==============================] - 1s 4ms/step - loss: 0.4298 - accuracy: 0.7809 - val_loss: 0.3686 - val_accuracy: 0.8031
Epoch 51/80
240/240 [==============================] - 1s 4ms/step - loss: 0.3434 - accuracy: 0.8148 - val_loss: 0.3522 - val_accuracy: 0.8109
Epoch 52/80
240/240 [==============================] - 1s 4ms/step - loss: 0.3541 - accuracy: 0.8029 - val_loss: 0.3474 - val_accuracy: 0.8062
Epoch 53/80
240/240 [==============================] - 1s 4ms/step - loss: 0.3588 - accuracy: 0.8042 - val_loss: 0.3645 - val_accuracy: 0.7988
Epoch 54/80
240/240 [==============================] - 1s 4ms/step - loss: 0.3312 - accuracy: 0.8191 - val_loss: 0.3691 - val_accuracy: 0.7957
Epoch 55/80
240/240 [==============================] - 1s 4ms/step - loss: 0.4476 - accuracy: 0.7866 - val_loss: 0.3562 - val_accuracy: 0.8074
Epoch 56/80
240/240 [==============================] - 1s 4ms/step - loss: 0.3443 - accuracy: 0.8172 - val_loss: 0.3423 - val_accuracy: 0.8133
Epoch 57/80
240/240 [==============================] - 1s 4ms/step - loss: 0.3201 - accuracy: 0.8309 - val_loss: 0.3498 - val_accuracy: 0.8156
Epoch 58/80
240/240 [==============================] - 1s 4ms/step - loss: 0.3388 - accuracy: 0.8271 - val_loss: 0.3405 - val_accuracy: 0.8234
Epoch 59/80
240/240 [==============================] - 1s 4ms/step - loss: 0.3928 - accuracy: 0.8113 - val_loss: 0.4061 - val_accuracy: 0.7844
Epoch 60/80
240/240 [==============================] - 1s 4ms/step - loss: 0.3133 - accuracy: 0.8414 - val_loss: 0.3462 - val_accuracy: 0.8207
Epoch 61/80
240/240 [==============================] - 1s 4ms/step - loss: 0.3235 - accuracy: 0.8346 - val_loss: 0.3172 - val_accuracy: 0.8367
Epoch 62/80
240/240 [==============================] - 1s 4ms/step - loss: 0.3266 - accuracy: 0.8380 - val_loss: 0.4751 - val_accuracy: 0.7781
Epoch 63/80
240/240 [==============================] - 1s 4ms/step - loss: 0.3383 - accuracy: 0.8342 - val_loss: 0.5884 - val_accuracy: 0.7211
Epoch 64/80
240/240 [==============================] - 1s 4ms/step - loss: 0.4937 - accuracy: 0.7839 - val_loss: 0.3353 - val_accuracy: 0.8344
Epoch 65/80
240/240 [==============================] - 1s 4ms/step - loss: 0.3086 - accuracy: 0.8483 - val_loss: 0.4384 - val_accuracy: 0.7824
Epoch 66/80
240/240 [==============================] - 1s 4ms/step - loss: 0.3059 - accuracy: 0.8471 - val_loss: 0.3159 - val_accuracy: 0.8473
Epoch 67/80
240/240 [==============================] - 1s 4ms/step - loss: 0.4006 - accuracy: 0.8198 - val_loss: 0.3212 - val_accuracy: 0.8434
Epoch 68/80
240/240 [==============================] - 1s 4ms/step - loss: 0.2855 - accuracy: 0.8620 - val_loss: 0.3491 - val_accuracy: 0.8328
Epoch 69/80
240/240 [==============================] - 1s 4ms/step - loss: 0.3101 - accuracy: 0.8486 - val_loss: 0.3134 - val_accuracy: 0.8512
Epoch 70/80
240/240 [==============================] - 1s 4ms/step - loss: 0.2986 - accuracy: 0.8578 - val_loss: 0.3047 - val_accuracy: 0.8516
Epoch 71/80
240/240 [==============================] - 1s 4ms/step - loss: 0.2770 - accuracy: 0.8672 - val_loss: 0.3497 - val_accuracy: 0.8250
Epoch 72/80
240/240 [==============================] - 1s 4ms/step - loss: 0.3284 - accuracy: 0.8522 - val_loss: 0.3024 - val_accuracy: 0.8523
Epoch 73/80
240/240 [==============================] - 1s 4ms/step - loss: 0.2853 - accuracy: 0.8654 - val_loss: 0.3435 - val_accuracy: 0.8297
Epoch 74/80
240/240 [==============================] - 1s 4ms/step - loss: 0.3135 - accuracy: 0.8527 - val_loss: 0.3044 - val_accuracy: 0.8602
Epoch 75/80
240/240 [==============================] - 1s 4ms/step - loss: 0.2686 - accuracy: 0.8803 - val_loss: 0.3399 - val_accuracy: 0.8387
Epoch 76/80
240/240 [==============================] - 1s 4ms/step - loss: 0.2884 - accuracy: 0.8628 - val_loss: 0.3098 - val_accuracy: 0.8555
Epoch 77/80
240/240 [==============================] - 1s 4ms/step - loss: 0.2579 - accuracy: 0.8824 - val_loss: 0.2905 - val_accuracy: 0.8695
Epoch 78/80
240/240 [==============================] - 1s 4ms/step - loss: 0.3785 - accuracy: 0.8436 - val_loss: 0.2963 - val_accuracy: 0.8621
Epoch 79/80
240/240 [==============================] - 1s 4ms/step - loss: 0.2841 - accuracy: 0.8690 - val_loss: 0.3012 - val_accuracy: 0.8605
Epoch 80/80
240/240 [==============================] - 1s 4ms/step - loss: 0.2464 - accuracy: 0.8866 - val_loss: 0.2862 - val_accuracy: 0.8703


#******************************************************************#
#Visualise Results with Training and Validation Accuracy/Loss Plots#
#******************************************************************#

#once the model is trained, it is possible to call its history method to get a dictionary of the loss and any 
#other metrics needed at every stage of the training

#as we see from the "Training and Validation Accuracy" plots below, training accuracy is clearly greater than 
#validation accuracy, for all epochs. with exception of the first 15 epochs, (which are stabilizing) 
#hence the result of these initial 15 epochs do not express the network state very well 

#this is the outcome, as the model can't generalize well. such models could mitigate overfitting with addition of 
#dropout, which we'll demonstrate below

#finally, we could possibly train this model longer as it doesn’t look like the loss has reached a minimum

#this means that the model will have a difficult time generalizing on a new dataset, hence we can't evaluate 
#on the test set, just yet. hence, we now require to increase the overall performance of the model


acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

loss = history.history['loss']
val_loss = history.history['val_loss']

epochs_range = range(80)

plt.figure(figsize=(8, 8))
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Training Accuracy')
plt.plot(epochs_range, val_acc, label='Validation Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')

plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()


#*************************************************************#
                 #Overfitted Model Summary#
#*************************************************************#

#view all the layers of the network using the Keras Model.summary method:
#model.summary shows the output shape of each layer and the number of trainable parameters, the number of 
#parameters in the W and B layer tensors

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 flatten_1 (Flatten)         (32, 16384)               0         
                                                                 
 dense_3 (Dense)             (32, 1024)                16778240  
                                                                 
 dense_4 (Dense)             (32, 512)                 524800    
                                                                 
 dense_5 (Dense)             (32, 128)                 65664     
                                                                 
 dense_6 (Dense)             (32, 4)                   516       
                                                                 
=================================================================
Total params: 17,369,220
Trainable params: 17,369,220
Non-trainable params: 0
_________________________________________________________________


#make predictions
y_pred=model.predict(X_test)

80/80 [==============================] - 0s 2ms/step


#confusion matrix provides an overview of how well the overfitted model predicts the four classes

cm = confusion_matrix(np.argmax(y_test,axis=-1), np.argmax(y_pred,axis=-1))

disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=class_names)
disp.plot()
plt.rcParams["figure.figsize"] = (10,10)
plt.show()


#the classification report provides the f1 score

print(classification_report(np.argmax(y_test,axis=-1), np.argmax(y_pred,axis=-1), target_names=class_names))

                    precision    recall  f1-score   support

     Mild_Demented       0.73      0.71      0.72       633
 Moderate_Demented       0.74      0.75      0.75       646
      Non_Demented       0.99      0.98      0.98       669
Very_Mild_Demented       1.00      1.00      1.00       612

          accuracy                           0.86      2560
         macro avg       0.86      0.86      0.86      2560
      weighted avg       0.86      0.86      0.86      2560


#the weight tensor has shape N, M, a matrix with N rows and M columns. therefore, there are N × M elements.
#the bias tensor is just a vector so the number of elements is equal to the length. the total number of trainable 
#parameters, the total number of elements in all the W and V tensors is therefore given by the sum which evaluates
#to 6,302 [Ref 17]

#def print_layer_tensor_shape(layer):

    #weight_params=model.layers[layer].get_weights()[0]
    #bias_params=model.layers[layer].get_weights()[1]
    #print(layer, '\t', weight_params.shape, '\t', bias_params.shape)

#print_layer_tensor_shape(layer=0)
#print_layer_tensor_shape(layer=1)
#print_layer_tensor_shape(layer=2)
#print_layer_tensor_shape(layer=3)

1 	 (16384, 1024) 	 (1024,)
2 	 (1024, 512) 	 (512,)
3 	 (512, 128) 	 (128,)


#define function for hyperparameter tuning

#plus, other changes made to reduce overfitting:

# 1) model size reduced drastically, around 10 times, as previous model was really big. 
# current model size is 16384 x 128 x 64 x 32 x 4. 

# 2) add dropout to mitigate overfitting

# 3) add batch normalisation to mitigate overfitting

def create_model(activation_function,optimizer,dropout_rate):

	#create model - add dropout layers between each pair of dense layers for regularization, into previous model above
	#the dropout layer takes an argument 'rate', which specifies the proportion of neurons in the preceding dense
	#layer,that should take a value of zero.  in this model, the rate is set to 0.5, which means 50% of the neurons
	#in the hidden layers are given a weight of 0.
  
	model = Sequential([
			#create empty network object. sequential means the network is a series of transformational layers 
	layers.Flatten(),
	Dropout(dropout_rate),
	BatchNormalization(),
	layers.Dense(128, activation=activation_function),
  Dropout(dropout_rate),
	BatchNormalization(),
	layers.Dense(64, activation=activation_function),
	Dropout(dropout_rate),
	BatchNormalization(),
	layers.Dense(32, activation=activation_function),
	Dropout(dropout_rate),
	BatchNormalization(),
	layers.Dense(4, activation='softmax'),#outputs a vector whose elements form a probability distribution (non negative floats that sum to one). outputs are interpreted as probabilities of membership of each class: the probability that the input sample is labeled '1/ mild demented', or '2/ moderate demented' and so on 
	])
    
	if(optimizer=="SGD"):  		
		opt=keras.optimizers.SGD(learning_rate=0.01,momentum=0.9)
	elif(optimizer=="adam"):		
		opt=keras.optimizers.Adam(learning_rate=0.01)
	elif(optimizer=="RMSprop"):		
		opt=keras.optimizers.RMSprop(learning_rate=0.01)
	
	model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])		
 	
	return model


# For additional hyper parameter tuning we have added 3 parameters to control 

# 1) Activation functions - 'tanh' and 'sigmoid' may be work better than relu in some cases, but generally we know 
#that relu is the rule of thumb
activation_function_arr=['relu','tanh','sigmoid']

# 2) Different optimizers - 'adam' is the typical choice when training the larger network, but if the network is 
#simple, maybe SGD and RMSprop converge faster
optimizer_arr=["SGD","adam","RMSprop"]

# 3) Different dropout rates - dropout rate is the parameter which we can't anticipate which value will work best, 
#so we require to trial
dropout_rate_arr=[0.3,0.4,0.5]
epochs=80

#use Early Stopping
#this is a regularization technique to avoid overfitting when training a learner with an iterative method,
#such as hold out validation. used for deep neural networks and stops training when parameter updates no longer 
#begin to yield improves on a validation set

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,patience=10)


#execute hyper parameter search using for loops (as opposed to the library, Gridsearch)

f1score_max=0
best_hyper_params=[]
best_history=None
iter=0
best_model=None
for activation_function in activation_function_arr :
  for optimizer in optimizer_arr :
    for dropout_rate in dropout_rate_arr :
      print("iter=",iter,"hp vapues=",[activation_function,optimizer,dropout_rate])
      model=create_model(activation_function,optimizer,dropout_rate)
      history = model.fit(X_train,y_train,validation_data=(X_val,y_val),epochs=epochs, verbose=0,callbacks=[es])#callbacks=callback
      y_pred=model.predict(X_test)

      f1score=f1_score(np.argmax(y_test,axis=-1), np.argmax(y_pred,axis=-1), average='weighted')
      print("f1 score for iter ",iter,"= ",f1score)
      iter=iter+1
      #Test for maximum accuracy
      if(f1score>f1score_max):
        f1score_max=f1score
        best_hyper_params=[activation_function,optimizer,dropout_rate]
        best_history=history
        best_model=model

print("Hyper parameter searching process is finished")
print("Best set of hyper parameteres=",best_hyper_params)
print("Best F1 score value=",f1score_max)

iter= 0 hp vapues= ['relu', 'SGD', 0.3]
Epoch 99: early stopping
80/80 [==============================] - 0s 2ms/step
f1 score for iter  0 =  0.9949214402930922
iter= 1 hp vapues= ['relu', 'SGD', 0.4]
Epoch 94: early stopping
80/80 [==============================] - 0s 2ms/step
f1 score for iter  1 =  0.9839610539329501
iter= 2 hp vapues= ['relu', 'SGD', 0.5]
80/80 [==============================] - 0s 2ms/step
f1 score for iter  2 =  0.90594720001182
iter= 3 hp vapues= ['relu', 'adam', 0.3]
Epoch 73: early stopping
80/80 [==============================] - 0s 2ms/step
f1 score for iter  3 =  0.9033091759562593
iter= 4 hp vapues= ['relu', 'adam', 0.4]
Epoch 17: early stopping
80/80 [==============================] - 0s 2ms/step
f1 score for iter  4 =  0.6875440179840053
iter= 5 hp vapues= ['relu', 'adam', 0.5]
Epoch 25: early stopping
80/80 [==============================] - 0s 2ms/step
f1 score for iter  5 =  0.6537049820103503
iter= 6 hp vapues= ['relu', 'RMSprop', 0.3]
Epoch 47: early stopping
80/80 [==============================] - 0s 2ms/step
f1 score for iter  6 =  0.9507013296647759
iter= 7 hp vapues= ['relu', 'RMSprop', 0.4]
Epoch 45: early stopping
80/80 [==============================] - 0s 2ms/step
f1 score for iter  7 =  0.9182401701426028
iter= 8 hp vapues= ['relu', 'RMSprop', 0.5]
Epoch 96: early stopping
80/80 [==============================] - 0s 2ms/step
f1 score for iter  8 =  0.8588173926065217
iter= 9 hp vapues= ['tanh', 'SGD', 0.3]
80/80 [==============================] - 0s 2ms/step
f1 score for iter  9 =  0.9898276252785572
iter= 10 hp vapues= ['tanh', 'SGD', 0.4]
80/80 [==============================] - 0s 2ms/step
f1 score for iter  10 =  0.9777208336785286
iter= 11 hp vapues= ['tanh', 'SGD', 0.5]
80/80 [==============================] - 0s 2ms/step
f1 score for iter  11 =  0.9193517861083553
iter= 12 hp vapues= ['tanh', 'adam', 0.3]
Epoch 19: early stopping
80/80 [==============================] - 0s 2ms/step
f1 score for iter  12 =  0.6169669909267844
iter= 13 hp vapues= ['tanh', 'adam', 0.4]
Epoch 14: early stopping
80/80 [==============================] - 0s 2ms/step
f1 score for iter  13 =  0.5345518272310239
iter= 14 hp vapues= ['tanh', 'adam', 0.5]
Epoch 18: early stopping
80/80 [==============================] - 0s 2ms/step
f1 score for iter  14 =  0.44506122909567286
iter= 15 hp vapues= ['tanh', 'RMSprop', 0.3]
Epoch 21: early stopping
80/80 [==============================] - 0s 2ms/step
f1 score for iter  15 =  0.6136093500612848
iter= 16 hp vapues= ['tanh', 'RMSprop', 0.4]
Epoch 15: early stopping
80/80 [==============================] - 0s 2ms/step
f1 score for iter  16 =  0.600691326991102
iter= 17 hp vapues= ['tanh', 'RMSprop', 0.5]
Epoch 18: early stopping
80/80 [==============================] - 0s 2ms/step
f1 score for iter  17 =  0.5663754041247115
iter= 18 hp vapues= ['sigmoid', 'SGD', 0.3]
80/80 [==============================] - 0s 2ms/step
f1 score for iter  18 =  0.9941448199888537
iter= 19 hp vapues= ['sigmoid', 'SGD', 0.4]
80/80 [==============================] - 0s 2ms/step
f1 score for iter  19 =  0.9652238374113731
iter= 20 hp vapues= ['sigmoid', 'SGD', 0.5]
80/80 [==============================] - 0s 2ms/step
f1 score for iter  20 =  0.8376928451326799
iter= 21 hp vapues= ['sigmoid', 'adam', 0.3]
Epoch 14: early stopping
80/80 [==============================] - 0s 2ms/step
f1 score for iter  21 =  0.4514764527601846
iter= 22 hp vapues= ['sigmoid', 'adam', 0.4]
Epoch 16: early stopping
80/80 [==============================] - 0s 2ms/step
f1 score for iter  22 =  0.49001929426546553
iter= 23 hp vapues= ['sigmoid', 'adam', 0.5]
Epoch 22: early stopping
80/80 [==============================] - 0s 2ms/step
f1 score for iter  23 =  0.5328087022779782
iter= 24 hp vapues= ['sigmoid', 'RMSprop', 0.3]
Epoch 37: early stopping
80/80 [==============================] - 0s 2ms/step
f1 score for iter  24 =  0.6326197042018332
iter= 25 hp vapues= ['sigmoid', 'RMSprop', 0.4]
Epoch 23: early stopping
80/80 [==============================] - 0s 2ms/step
f1 score for iter  25 =  0.5825887413382798
iter= 26 hp vapues= ['sigmoid', 'RMSprop', 0.5]
Epoch 14: early stopping
80/80 [==============================] - 0s 2ms/step
f1 score for iter  26 =  0.6000927949696307
Hyper parameter searching process is finished
Best set of hyper parameteres= ['relu', 'SGD', 0.3]
Best F1 score value= 0.9949214402930922


# Evaluate the best model which we found by hyper parameter optimisation

model=create_model('relu','SGD',0.3)#found these 3 hyperparameters during execution of hold out validation
history = model.fit(X_train,y_train,validation_data=(X_val,y_val),epochs=epochs, verbose=0,callbacks=[es])#callbacks=callback

y_pred=best_model.predict(X_test)

cm = confusion_matrix(np.argmax(y_test,axis=-1), np.argmax(y_pred,axis=-1))

disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=class_names)
disp.plot()
plt.rcParams["figure.figsize"] = (10,10)
plt.show()

Epoch 94: early stopping
80/80 [==============================] - 0s 2ms/step


print(classification_report(np.argmax(y_test,axis=-1), np.argmax(y_pred,axis=-1), target_names=class_names))

                    precision    recall  f1-score   support

     Mild_Demented       1.00      0.98      0.99       633
 Moderate_Demented       0.98      1.00      0.99       646
      Non_Demented       1.00      1.00      1.00       669
Very_Mild_Demented       1.00      1.00      1.00       612

          accuracy                           0.99      2560
         macro avg       0.99      0.99      0.99      2560
      weighted avg       0.99      0.99      0.99      2560

Predicting the Early Stages of Alzheimer's Disease to Potentially¶

Prevent or Delay the Development of Dementia - Part 1¶

Utilising the Multi-Class, Single Label, Sequential Model Architecture (and Dropout)¶

1. Introduction¶

2. Objectives¶

3. Dataset¶

4. Evaluation Methodology (Choosing a Measure of Success)¶

5. Deciding on an Evaluation Protocol¶

6. Data Preprocessing¶

7. Baseline Performance¶

8. Scaling Up - Developing a Model that Overfits¶

Building the Network¶

Compiling the Model¶

Training the Network¶

9. Hyperparameter Tuning and Regularization¶

Hyperparameter Tuning¶

Adding Dropout (Regularization Technique)¶

10. Evaluation¶

11. Results¶

12. Conclusions¶

References:¶