# importing libraries
import pandas as pd, numpy as np
import statistics


# to ignore any warning messages.
import warnings
warnings.filterwarnings('ignore')

# for visualization
import matplotlib
import matplotlib.pyplot as plt, seaborn as sns

# for unit testing
from unittest.mock import patch, Mock
import unittest

import re
import nltk
nltk.download('stopwords')
import matplotlib.pyplot as plt
pd.options.display.max_colwidth = 200
%matplotlib inline
import os
import textwrap

from platform import python_version

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/stephenmalcolm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Version information of libraries
print('Pandas Version: ', pd.__version__)
print('Numpy Version: ', np.__version__)
print('Matplotlib Version: ', matplotlib.__version__)
print('Seaborn Version: ', sns.__version__)
print('Python Version: ', python_version())

Pandas Version:  1.4.4
Numpy Version:  1.24.4
Matplotlib Version:  3.7.1
Seaborn Version:  0.12.2
Python Version:  3.8.16


#first, check we're using the right file type/encoder i.e. it should be UTF-8
#this ensures, it's relatively clean, and can be read and organised (as opposed to some other unusable formats)
import pandas as pd
import csv
data = open('mental-health-survey.csv', 'r')
data

<_io.TextIOWrapper name='mental-health-survey.csv' mode='r' encoding='UTF-8'>


#we have the correct file encoder, as stated in the above output
#now, import 

import csv
df=pd.read_csv('mental-health-survey.csv')

df.head()._append(df.tail())


#after reviewing the analysis from the survey column, we should have 102 respondants (rows) and 10 questions 
#(columns) meaning, that all of our survey data is now stored in that DataFrame and ready for analysis
#those extra columns (see output below), highlights we have to drop several columns
#let's address further into the preprocessing exercise


#Let's check the number of rows and columns
df.shape

(102, 27)


#Let's check for missing values (Nan) and provide a copy or list of the original questions
#From above output, we have 27 columns
df.isnull().sum()

Unique Response Number                                                                                       0
1. To what age group do you belong?                                                                          0
2. Which gender do you most strongly identify with?                                                          0
3. What is your region of residence?                                                                         0
4. Has the covid-19 lockdown affected your state of mind in any way?                                         0
5. Do you share a home with any of the following?                                                            0
5.a. If you selected Other, please specify:                                                                102
6. If you compare the pre-COVID-19 lockdown and the actual COVID-19 lockdown, how has:                     102
6.1. Your amount of communication with family and friends changed?                                           0
6.2. Your financial status changed?                                                                          0
7. During the Covid 19 lockdown, please state the effects and restraints on your wellbeing:                102
7.1. How often did you feel unhappy/ depressed?                                                              0
7.2. How often did you feel anxious/ a loss of confidence?                                                   0
7.3. How often did you feel under stress?                                                                    0
8. During the Covid 19 lockdown, please state the effects and restraints on your daily life:               102
8.1. How satisfied were you with your study and/ or work?                                                    0
8.2. How satisfied were you with daily life activities?                                                      0
8.3. How satisfied were you with new activities?                                                             0
9. How many hours per day are/ were you able to spend outside, on average during the lockdown?               0
10. Select if you have had any of these psychological reactions towards the covid-19 pandemic?               0
10.a. If you selected Other, please specify:                                                               102
11. During the Covid 19 lockdown, which of the following helped you feel better?                             0
11.a. If you selected Other, please specify:                                                               102
12. Have you been tempted to access support from social groups during the covid-19 pandemic?                 0
12.a. If you selected Other, please specify:                                                               102
13. Do you feel that the Covid 19 pandemic has affected your access to healthcare for other conditions?      0
13.a. If you selected Other, please specify:                                                               102
dtype: int64


#lets create a data type function for displaying null values and data types
def finding_null_value(df):

    total_null = df.isnull().sum()           

    total_percent = (df.isnull().sum()/df.isnull().count()*100) 
    
    
    new_var = pd.concat([total_null, total_percent], axis=1, keys=['Total_null', 'Total_percent(%)'])


    types_array = []
    for column in df.columns:
        dtype = str(df[column].dtype)
        types_array.append(dtype)
    new_var['Types'] = types_array

    return(np.transpose(new_var))


#The data type function displays:
# 1. Total null values 
# 2. Total percentage 
# 3. Display types of every feature (don't need to use Dtypes command, however, we'll still demonstrate below) 
finding_null_value(df)


#Need to rename headers, to make it tidier and coherent

df.columns = ['URN','Age', 'Gender', 'Region', 'SOM_Affected', 'Home Sharing', 'Q5a', 'Q6', 
              'Impact on Communication', 'Impact on Financial Status', 'Q7', 'Unhappy_or_Depressed',
              'Anxious or Low Confidence', 'Stressed', 'Q8', 'Study or Work Satisfaction', 'Routine Satisfaction',
              'New Activities Satisfaction', 'Hours Spent Outside', 'Psychological_Reactions', 'Q10a',
              'Supported_Activities', 'Q11a', 'Social_Support', 'Q12a', 'Healthcare_Access', 'Q13a'] 
df.head()


#The respondents didn't bother populating any of the 'Other' options. Hence, we have missing values here
#Questions 6, 7 and 8 are general overview questions, that don't require an answer. 
#Hence, these are Q5a, Q6, Q7, Q8, Q10a, Q11a, Q12a and Q13a
#Drop these columns

df = df.dropna(axis=1)


#We should delete the unique response number/ URN column
#it serves no purpose

df.drop('URN', axis=1, inplace=True)
df


#Very quick and dirty 'summary of statistics'
#Statistical summary for categorical or string variables will show “count”, “unique”, “top”, and “freq”.

df.describe(include='all').T


#Preparing a separate dataframe for later

df_new=df.copy()


#Present types of categories here or use function above

df.dtypes

Age                            object
Gender                         object
Region                         object
SOM_Affected                   object
Home Sharing                   object
Impact on Communication        object
Impact on Financial Status     object
Unhappy_or_Depressed           object
Anxious or Low Confidence      object
Stressed                       object
Study or Work Satisfaction     object
Routine Satisfaction           object
New Activities Satisfaction    object
Hours Spent Outside            object
Psychological_Reactions        object
Supported_Activities           object
Social_Support                 object
Healthcare_Access              object
dtype: object


#Set the nominal (non-ordered categorical) data types from object to category type

df['Gender']=df['Gender'].astype('category')
df['Region']=df['Region'].astype('category')
df.dtypes

Age                              object
Gender                         category
Region                         category
SOM_Affected                     object
Home Sharing                     object
Impact on Communication          object
Impact on Financial Status       object
Unhappy_or_Depressed             object
Anxious or Low Confidence        object
Stressed                         object
Study or Work Satisfaction       object
Routine Satisfaction             object
New Activities Satisfaction      object
Hours Spent Outside              object
Psychological_Reactions          object
Supported_Activities             object
Social_Support                   object
Healthcare_Access                object
dtype: object


#Verify dtype
#This gives us an extra step to check that there's no mistakes or no unexpected values in our data
df['Gender'].dtype

CategoricalDtype(categories=['Female', 'Male'], ordered=False)


#Perform the same for the Region column
df['Region'].dtype

CategoricalDtype(categories=['Africa', 'Asia', 'Europe', 'North America', 'Oceania',
                  'South America'],
, ordered=False)


#Ordinal (ordinal categorical)/ Questions 6.1, 6.2 and 7.1, 7.2, 7.3 and 8.1, 8.2, 8.3
#Using a likert scale

#Sometimes when setting categorical variables, it is also necessary to specify all of the categories explicitly.
#This is very often the case for ordinal variables, as well as specifying all of the categories that are permitted 
#in a particular ordinal variable, we also need to specify what ordering they follow. This is the case for the 
#key variables in this dataset, which relates to 8 different columns/questions.

#Q6.1 - Set up new variable

friends_family = ['same as usual', 'less than usual', 'more than usual']

#Set correct data types
df['Impact on Communication'] = df['Impact on Communication'].astype(pd.CategoricalDtype(ordered=True, categories=friends_family))

#Inspect dtype
df['Impact on Communication'].dtype

CategoricalDtype(categories=['same as usual', 'less than usual', 'more than usual'], ordered=True)


#Ordinal (ordinal categorical)
#Q6.2 - Set up new variable

money = ['same as usual', 'less than usual', 'more than usual']

#Set correct data types
df['Impact on Financial Status'] = df['Impact on Financial Status'].astype(pd.CategoricalDtype(ordered=True, categories=money))

#Inspect dtype
df['Impact on Communication'].dtype

CategoricalDtype(categories=['same as usual', 'less than usual', 'more than usual'], ordered=True)


#Ordinal (ordinal categorical)
#Q7.1 - Set up new variable

unhappy = ['almost always', 'often', 'sometimes', 'seldom', 'never']

#Set correct data types
df['Unhappy_or_Depressed'] = df['Unhappy_or_Depressed'].astype(pd.CategoricalDtype(ordered=True, categories=unhappy))

#Inspect dtype
df['Unhappy_or_Depressed'].dtype

CategoricalDtype(categories=['almost always', 'often', 'sometimes', 'seldom', 'never'], ordered=True)


#Ordinal (ordinal categorical)
#Q7.2 - Set up new variable

anxious = ['almost always', 'often', 'sometimes', 'seldom', 'never']

#Set correct data types
df['Anxious or Low Confidence'] = df['Anxious or Low Confidence'].astype(pd.CategoricalDtype(ordered=True, categories=anxious))

#Inspect dtype
df['Anxious or Low Confidence'].dtype

CategoricalDtype(categories=['almost always', 'often', 'sometimes', 'seldom', 'never'], ordered=True)


#Ordinal (ordinal categorical)
#Q7.3 - Set up new variable

stress = ['almost always', 'often', 'sometimes', 'seldom', 'never']

#Set correct data types
df['Stressed'] = df['Unhappy_or_Depressed'].astype(pd.CategoricalDtype(ordered=True, categories=stress))

#Inspect dtype
df['Unhappy_or_Depressed'].dtype

CategoricalDtype(categories=['almost always', 'often', 'sometimes', 'seldom', 'never'], ordered=True)


#Ordinal (ordinal categorical)
#Q8.1 - Set up new variable

study_work = ['very unsatisfied', 'unsatisfied', 'neutral', 'satisfied', 'very satisfied']

#Set correct data types
df['Routine Satisfaction'] = df['Routine Satisfaction'].astype(pd.CategoricalDtype(ordered=True, categories=study_work))

#Inspect dtype
df['Routine Satisfaction'].dtype

CategoricalDtype(categories=['very unsatisfied', 'unsatisfied', 'neutral', 'satisfied',
                  'very satisfied'],
, ordered=True)


#Ordinal (ordinal categorical)
#Q8.2 - Set up new variable

routine = ['very unsatisfied', 'unsatisfied', 'neutral', 'satisfied', 'very satisfied']

#Set correct data types
df['Study or Work Satisfaction'] = df['Study or Work Satisfaction'].astype(pd.CategoricalDtype(ordered=True, categories=routine))

#Inspect dtype
df['Study or Work Satisfaction'].dtype

CategoricalDtype(categories=['very unsatisfied', 'unsatisfied', 'neutral', 'satisfied',
                  'very satisfied'],
, ordered=True)


#Ordinal (ordinal categorical)
#Q8.3 (Likert)- Set up new variable

new_activity = ['very unsatisfied', 'unsatisfied', 'neutral', 'satisfied', 'very satisfied']

#Set correct data types
df['New Activities Satisfaction'] = df['New Activities Satisfaction'].astype(pd.CategoricalDtype(ordered=True, categories=new_activity))

#Inspect dtype
df['New Activities Satisfaction'].dtype

CategoricalDtype(categories=['very unsatisfied', 'unsatisfied', 'neutral', 'satisfied',
                  'very satisfied'],
, ordered=True)


#Ordinal (ordinal categorical)
#Q5 (Checkbox) - Set up new variable
#This particular checkbox must be ordered, or the barchart wont be interpreted correctly

outside = ['0-1 hours per day', '2-3 hours per day', '4-5 hours per day', '6-8 hours per day', 'More than 8 hours per day']

#Set correct data types
df['Hours Spent Outside'] = df['Hours Spent Outside'].astype(pd.CategoricalDtype(ordered=True, categories=outside))

#Inspect dtype
df['Hours Spent Outside'].dtype

CategoricalDtype(categories=['0-1 hours per day', '2-3 hours per day',
                  '4-5 hours per day', '6-8 hours per day',
                  'More than 8 hours per day'],
, ordered=True)


#Ordinal (ordinal categorical)
#Q1 - Set up new variable

age = ['Under 18', '18-24', '25-34', '35-44', '45-54', '55-64', '65+']

#Set correct data types
df['Age'] = df['Age'].astype(pd.CategoricalDtype(ordered=True, categories=age))

#Inspect dtype
df['Age'].dtype

CategoricalDtype(categories=['Under 18', '18-24', '25-34', '35-44', '45-54', '55-64',
                  '65+'],
, ordered=True)


#We can see below, the specified objects that were once object, have now been transformed into category

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102 entries, 0 to 101
Data columns (total 18 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   Age                          102 non-null    category
 1   Gender                       102 non-null    category
 2   Region                       102 non-null    category
 3   SOM_Affected                 102 non-null    object  
 4   Home Sharing                 102 non-null    object  
 5   Impact on Communication      0 non-null      category
 6   Impact on Financial Status   0 non-null      category
 7   Unhappy_or_Depressed         0 non-null      category
 8   Anxious or Low Confidence    0 non-null      category
 9   Stressed                     0 non-null      category
 10  Study or Work Satisfaction   0 non-null      category
 11  Routine Satisfaction         0 non-null      category
 12  New Activities Satisfaction  0 non-null      category
 13  Hours Spent Outside          102 non-null    category
 14  Psychological_Reactions      102 non-null    object  
 15  Supported_Activities         102 non-null    object  
 16  Social_Support               102 non-null    object  
 17  Healthcare_Access            102 non-null    object  
dtypes: category(12), object(6)
memory usage: 8.5+ KB


#Before we dive in to the EDA, first, let's implement an error handling procedure to handle exceptional 
#conditions, with a 'Try Except' statement.
#Try Except

def counts(col):
    try:
        counts_new=df_new[col].value_counts(sort=False)
        return counts_new
    except:
        print("Write appropriate column Name")


#Checking the Gender categories
counts("Gender")

Male      56
Female    46
Name: Gender, dtype: int64


#I will deliberately use the wrong column which is not in the dataset. I want to verify that the "Try Except" code 
#works. And you can see that this message outputs "Write appropriate column Name".
counts("Education")

Write appropriate column Name


check = counts("Stressed") #This is real variable/ column name
check

Sometimes        48
Often            24
Almost Always     6
Seldom           24
Name: Stressed, dtype: int64


#Lets run a Unit Test for the 'Stressed' variable
from pandas._testing import assert_frame_equal 
def test_counts(col):
    df_counts = pd.DataFrame(counts(col))
    df_ref_counts= pd.DataFrame(check)
    return assert_frame_equal(df_counts,df_ref_counts)


c = input("Column Name :")
test_counts(c) 

# After running block, use the Correct column name in the blank cell
#i.e. as we're verifying the 'Stressed' variable, type Stressed in the empty cell below and PRESS ENTER

Column Name :Stressed


#INTERRUPT KERNAL ONLY IF REQUIRED i.e. IF INCORRECT COLUMN NAME IS ENTERED


#Impact on Financial Status categories
counts("Impact on Financial Status")

Less Than Usual    48
Same as Usual      42
More Than Usual    12
Name: Impact on Financial Status, dtype: int64


#Impact on Communication categories
counts("Impact on Communication")

More Than Usual    35
Less Than Usual    32
Same as Usual      35
Name: Impact on Communication, dtype: int64


#Unhappy or Depressed categories
df_new['Unhappy_or_Depressed'].value_counts()

Sometimes        44
Often            27
Seldom           24
Almost Always     7
Name: Unhappy_or_Depressed, dtype: int64


#Study or Work Satisfaction categories
df_new['Study or Work Satisfaction'].value_counts()

Neutral             39
Unsatisfied         35
Satisfied           22
Very Unsatisfied     6
Name: Study or Work Satisfaction, dtype: int64


#checking the Region categories
df_new['Region'].value_counts()

Europe           26
Africa           19
North America    18
Asia             17
South America    13
Oceania           9
Name: Region, dtype: int64


#State of Mind Category
df_new['SOM_Affected'].value_counts()

Yes    73
No     29
Name: SOM_Affected, dtype: int64


#define a variable for 'yes' respondents
said_yes = df[df['SOM_Affected'] == 'Yes']
said_yes.shape

(73, 18)


#The number of people who answered ‘Yes’ to the State of Mind question can be verfied by running a quick 
#value_counts() on this dataframe:

counts = df['SOM_Affected'].value_counts(sort=False)#Preserve order of ordinal variable
counts

Yes    73
No     29
Name: SOM_Affected, dtype: int64


#Bar chart - State of Mind Affected with Covid 19 Lockdown
#count categories but do not sort by frequency

counts=df['SOM_Affected'].value_counts(sort=False)

sns.set(font_scale=1.0)
plt.figure(figsize=(10, 7),dpi=80)

plt.tick_params(labelsize=15);
plt.suptitle('State of Mind (SOM) Affected with Covid 19 Lockdown', fontsize=18)

ax=counts.plot.barh()
ax.set_xlabel('Frequency', fontsize=16)
ax.set_ylabel('Yes or No',fontsize=16)
plt.show()


#As we can see from the barchart above, and descriptive statistics below:
#Approximately 72% of the survey respondents stated the Covid 19 Lockdown affected their state of mind.

df['SOM_Affected'].describe()

count     102
unique      2
top       Yes
freq       73
Name: SOM_Affected, dtype: object


#Normalised Stack Barchart - State of Mind(SOM) During the Covid 19 Lockdown

#This will adjust values measured on different scales to a common scale.
#Will also calculate percentages.

#*************************************************************#
#From those that stated yes, which represent males or females?#
#*************************************************************#

"""From the chart below, it's clear that Males suffered the most, with respect to State of Mind, 
during the Covid 19 lockdown."""

# Remove all 'prefer not to say' responses. None in our case
#df = df.loc[df['gender'] != 'prefer not to say']

# Generate normalised cross tabulation.
table = pd.crosstab(df['SOM_Affected'], df['Gender'], normalize='index')
# Sort by female responses and convert to percentage.
table = table.sort_values(by='Female', ascending=False) * 100

sns.set(font_scale=1.0)
plt.figure(figsize=(16,8),dpi=80)

ax = table.plot.barh(stacked=True)
ax.invert_yaxis()
ax.axvline(50, color='grey', linestyle='dashed', linewidth=1)

plt.tick_params(labelsize=15);
plt.suptitle('Percentage of State of Mind (SOM) Affected\nDuring the Covid 19 Lockdown', fontsize=18)


ax.set_xlabel('Percent', fontsize=16)
plt.ylabel('Did Covid 19 Lockdown\nAffect Our Repondents SOM?', multialignment='center', fontsize=16)

# Place legent outside plot axes.
ax.legend(bbox_to_anchor=(1.0, 1.0))

plt.show()

#*Reference 6 (refer to the foot of this notebook) was used to summarize the above.

<Figure size 1280x640 with 0 Axes>


#Cross Tabulation Table - Frequency of SOM Affected in Different Regions of the world

#The crosstab function builds a cross tabulation table that can portray the frequency with which different groups
#of data appear. It is normalized

table = pd.crosstab(df['Region'], df['SOM_Affected'], normalize='index')
# Sort by female responses and convert to percentage.
table = table.sort_values(by='Yes', ascending=False) * 100
table


#Normalised Stack Barchart - Percentage of SOM Affected by REGION

#*************************************************************************************#
#Did some regions of the world suffer more than others, with respect to state of mind?#
#*************************************************************************************#

"""It's clear from the chart below and table above, that the African nation's state of mind was affected the most 
during the Covid 19 lockdown.
The European nation's state of mind was the least affected.
Deeper research is required on why this is and what factors came into play."""

#Remove all 'prefer not to say' responses. Not applicable here
#df = df.loc[df['gender'] != 'prefer not to say']

# Generate normalised cross tabulation.
table = pd.crosstab(df['Region'], df['SOM_Affected'], normalize='index')
# Sort by female responses and convert to percentage.
table = table.sort_values(by='Yes', ascending=False) * 100

sns.set(font_scale=1.0)
plt.figure(figsize=(16, 8),dpi=80)

ax = table.plot.barh(stacked=True)
ax.invert_yaxis()
ax.axvline(50, color='grey', linestyle='dashed', linewidth=1)

plt.tick_params(labelsize=15);
plt.suptitle('Percentage of State of Mind (SOM)\n Affected by Region', fontsize=18)

ax.set_xlabel('Percent', fontsize=16)
ax.set_ylabel('Regions of The World', fontsize=16)
# Place legent outside plot axes.
ax.legend(bbox_to_anchor=(1.0, 1.0))
plt.show()

<Figure size 1280x640 with 0 Axes>


table = pd.crosstab(df['Age'], df['SOM_Affected'], normalize='index')
# Sort by female responses and convert to percentage.
table = table.sort_values(by='Yes', ascending=False) * 100
table


#Normalised Stack Barchart - Percentage of SOM Affected by AGE

#*****************************************************************************#
#Did certain age groups suffer more than others, with respect to state of mind?
#*****************************************************************************#

"""It's clear from the chart below and table above, there is a pattern. As the age bracket increases, our 
respondent's state of mind is less affected. Hence, younger people suffered the most and older people suffered the
least during the Covid 19 lockdown.
Perhaps there's more resiliance with the older generation, e.g experience/ handling of traumatic situations."""

#Remove all 'prefer not to say' responses. Not applicable here
#df = df.loc[df['gender'] != 'prefer not to say']

# Generate normalised cross tabulation.
table = pd.crosstab(df['Age'], df['SOM_Affected'], normalize='index')
# Sort by female responses and convert to percentage.
table = table.sort_values(by='Yes', ascending=False) * 100

sns.set(font_scale=1.0)
plt.figure(figsize=(18, 8), dpi=80)

ax = table.plot.barh(stacked=True)
ax.invert_yaxis()
ax.axvline(50, color='grey', linestyle='dashed', linewidth=1)

plt.tick_params(labelsize=15);
plt.suptitle('Percentage of State of Mind (SOM)\nAffected by Age', fontsize=18)

ax.set_xlabel('Percent', fontsize=16)
ax.set_ylabel('Age', fontsize=16)

# Place legent outside plot axes.
ax.legend(bbox_to_anchor=(1.0, 1.0))
plt.show()

<Figure size 1440x640 with 0 Axes>


#Count categories but do not sort by frequency
counts = df['Hours Spent Outside'].value_counts(sort=False)#Preserve order of ordinal variable
counts

0-1 hours per day            22
2-3 hours per day            32
4-5 hours per day            38
6-8 hours per day             8
More than 8 hours per day     2
Name: Hours Spent Outside, dtype: int64


#count categories but do not sort by frequency

counts=df['Hours Spent Outside'].value_counts(sort=False)

sns.set(font_scale=1.0)
plt.figure(figsize=(10, 7),dpi=80)

plt.tick_params(labelsize=15);
plt.suptitle('How Much Fresh Air Our Respondents Had\nDuring the Covid 19 Lockdown', fontsize=18)

ax=counts.plot.barh()
ax.set_xlabel('Frequency', fontsize=16)
ax.set_ylabel('Hours Spent Outside', fontsize=16)
plt.show()


#As we can see from the barchart above and descriptive ststistics below:
#The mode or highest frequency is 4-5 hours per day.
#This was the most popular time band spent outside during the Covid 19 lockdown

df['Hours Spent Outside'].describe()

count                   102
unique                    5
top       4-5 hours per day
freq                     38
Name: Hours Spent Outside, dtype: object


#Lets tabulate this data above in a coherant fashion
pd.crosstab(df['Hours Spent Outside'], df['Gender'], margins=True)


##df6 = df['Hours Spent Outside'].mode()[0]
##modee_pos = counts.index.get_loc(df6)


#Plot the counts as a bar chart
#ax=counts.plot.bar(rot=90)

#plt.tick_params(labelsize=15);
    
#ax.set_title('How Much Fresh Air During the Covid 19 Lockdown',fontsize=18)
#ax.set_xlabel('Frequency',fontsize=16)
#ax.set_ylabel('Hours Spent Outside',fontsize=16)

#Format counts as integers
#ax.yaxis.set_major_formatter(mpl.ticker.EngFormatter(places=0)
                             
#Find the mode so we can label it on the plot
#df6 = df['Hours Spent Outside'].mode()
#df6
                             
#Find the index of the mode in the plot
#modee_pos = counts.index.get_loc(df6)
#try:
    #ax.annotate('mode={}'.format(df6), xy=(modee_pos +0.25, 14),
            #xytext=(modee_pos + 4 + 0.7, 15),
            #arrowprops=dict(facecolor='black', shrink=0.05))
    #plt.show()
#except:
    #print("Plot is not identified")


#********************************************************************#
#Who did our respondents live with during the Covid 19 Lockdown?
#********************************************************************#

"""From the value counts below, we can say that the majority (51) of "Male and Female" repondents are sharing 
   their home with other family members. Only 15 respondents lived alone during the Covid 19 Lockdown 
   (least amount of respondents)."""

'From the value counts below, we can say that the majority (51) of "Male and Female" repondents are sharing \n   their home with other family members. Only 15 respondents lived alone during the Covid 19 Lockdown \n   (least amount of respondents).'


#1. Split strings into separate columns and create a new dataframe - Q5

home_sharing = df_new['Home Sharing'].str.split(',', expand=True)
pd.DataFrame(home_sharing)


#2. Stack columns into a single variable

home_sharing = home_sharing.stack()             #This creates a multi-index
#home_sharing = 'home sharing'                   #Name the derived variable  #Set to categorical
home_sharing.index.names ='id','option'     #Name multi-index dimensions
df15 = pd.DataFrame(home_sharing)
df15


#Joining checkbox answers with other variables i.e. Gender

df2=pd.read_csv('mental-health-survey.csv', usecols=[2,5])
df2.columns=['Gender', 'Home Sharing']
df2.index.name='id'
df2


#We want to check value counts for each choice in Home Sharing
#First create a new empty dataframe

df_checkbox = pd.DataFrame(columns = ['Share'])
print(df_checkbox)

Empty DataFrame
Columns: [Share]
Index: []


#Create a for loop to iterate the df15 dataframe from step 2
ar = []
for i in range(len(df15)):
    ar.append(df15.iloc[i,0])
    
    
for l in range(len(ar)):
    df_checkbox = df_checkbox._append({'Share' :ar[l]},
                       ignore_index = True)


#Now display counts for each choice in Home Sharing
df_checkbox['Share'].value_counts()

Other Family Memebers         51
Your child/ children          31
Your partner                  28
One or multiple housemates    25
No one                        15
 I live alone                 15
Name: Share, dtype: int64


unhappy_data = df_new['Unhappy_or_Depressed'].value_counts()
unhappy_df=pd.DataFrame(unhappy_data)
unhappy_df


Anxious_data = df_new['Anxious or Low Confidence'].value_counts()
Anxious_df=pd.DataFrame(Anxious_data)
Anxious_df


Stressed_data = df_new['Stressed'].value_counts()
Stressed_df = pd.DataFrame(Stressed_data)
Stressed_df


df38 = pd.DataFrame(columns = ['Question','Sometimes','Often','Seldom','Almost_Always','Never','Gender'])
print(df38)

Empty DataFrame
Columns: [Question, Sometimes, Often, Seldom, Almost_Always, Never, Gender]
Index: []


try:
    df38 = df38.append({'Question':"How often did you feel unhappy/ depressed",'Sometimes':unhappy_df.iloc[0,0], 'Often' :unhappy_df.iloc[1,0],'Seldom':unhappy_df.iloc[2,0], 'Almost_Always' :unhappy_df.iloc[3,0],'Never' :0,'Gender':'Gender'},
                   ignore_index = True)
    df38= df38.append({'Question':"How often did you feel under stress",'Sometimes':Stressed_df.iloc[0,0], 'Often' :Stressed_df.iloc[1,0],'Seldom':Stressed_df.iloc[2,0], 'Almost_Always' :Stressed_df.iloc[3,0],'Never':0,'Gender':'Gender'},
                   ignore_index = True)
    df38 =df38.append({'Question':"How often did you feel anxious/ a loss of confidence",'Sometimes':Anxious_df.iloc[0,0], 'Often' :Anxious_df.iloc[1,0],'Seldom':Anxious_df.iloc[2,0],'Almost_Always' :Anxious_df.iloc[3,0],'Never' :Anxious_df.iloc[4,0],'Gender':'Gender'},
                   ignore_index = True)
except:
    print("Please add appropriate column name and value")


#Print dataframe with all three sub questions and associated options
df38


#************************************#
#How often did our respondents:#
#1. Feel unhappy or depressed?#
#2. Feel under stress#
#3. Feel anxious/ loss of confidence#
#************************************#

"""From the DataFrame above and Likert Chart below, we can infer that most of our respondents felt:
unhappy/depressed, under stress and a loss of confidence only 'Sometimes'. 'Often' was the next highest frequency
count for our respondents. Only a very small minority 'never' experienced any of these symptoms """


wrapper = textwrap.TextWrapper(width=50)
df38['Question'] = df38['Question'].apply(lambda x: wrapper.fill(x))


colors = ['white', 'firebrick','lightcoral','gainsboro','cornflowerblue', 'darkblue']
df38 = df38.set_index('Question')



middle = df38[["Sometimes", "Often"]].sum(axis=1) + df38["Seldom"]*.1
longest = middle.max()*1.1


df38.insert(0, '', (middle - longest).abs())
ax = df38[df38['Gender']=='Gender'].dropna(axis=1).plot.barh(
    stacked=True, 
    color=colors,
    figsize=(10,7),
    edgecolor='none'
    
)


z = plt.axvline(longest, linestyle='--', color='black', alpha=.5)
z.set_zorder(-1)


comp_long = int(df38.sum(axis=1).max()*1.05) 
plt.xlim(0, comp_long)



xvalues = [longest+50-50*i for i in range(4)]
xlabels = ['{:4.0f}'.format(x-longest) for x in xvalues]
plt.xticks(xvalues, xlabels)


box = ax.get_position()
ax.set_position([box.x0, box.y0 + box.height * 0.1,
                 box.width, box.height * 0.8])

# create the legend
handler, label = ax.get_legend_handles_labels()
ax.legend(
    handler, 
    label, 
    loc='upper center', 
    bbox_to_anchor=(.5, -0.2), 
    ncol=5, 
    edgecolor='white'
)

# set labels and title
plt.xlabel('Count',fontsize= 15)
plt.ylabel('',fontsize= 15)
plt.title('How often did you feel:',fontsize= 18)
plt.show()


#Finding the number of categories

sometimes = df_new['Unhappy_or_Depressed'] == 'Sometimes'
often = df_new['Unhappy_or_Depressed'] == 'Often'
seldom = df_new['Unhappy_or_Depressed'] == 'Seldom'
AlmostAlways = df_new['Unhappy_or_Depressed'] == 'Almost Always'


#number of people according to the Unhappy or Depressed categories
#agee = df_new['Unhappy or Depressed'].replace(unhappy)
#we're unable to change categorical to numerical data. hence, we assign a numerical scale to our likert data

age = {'Under 18': 0, '18-24': 1, '25-34':2, '35-44':3, '45-54':4, '55-64':5}
ageCol = df_new['Age'].replace(age)
ageCol.value_counts()

3    35
2    32
4    20
1     9
0     3
5     3
Name: Age, dtype: int64


#Finding frequency
unCount1 = ageCol[sometimes].dropna()
unCount2 = ageCol[often].dropna()
unCount3 = ageCol[seldom].dropna()
unCount4 = ageCol[AlmostAlways].dropna()


#Finding Frequency
unFreq1 = unCount1.value_counts()
unFreq2 = unCount2.value_counts()
unFreq3 = unCount3.value_counts()
unFreq4 = unCount4.value_counts()


#*************************************************#
#How unhappy or depressed are different age groups?
#*************************************************#

"""This chart below and value counts above highlight that there's a relationship with unhappiness or depression,
in relation to age.  Apart from age group 2(25-34 year olds), there is a decline in unhappiness/ depression as 
our respondent's age increases.  
   It also highlights that our respondents that are unhappy, feel depressed 'sometimes', but not 'almost always'.
   The majority of the older three age groups (3,4,and 5) 'seldom' (not often) felt unhappy or depressed during
   the Covid 19 lockdown.
"""
plotdata = pd.DataFrame({'Sometimes': unFreq1, 'Often': unFreq2, 'Seldom': unFreq3, 'Almost Always': unFreq4})

plt.figure(figsize=(13, 9))

ax = plotdata.plot(kind='bar', stacked=True, figsize=(10,7),fontsize=18)
ax.set_xlabel('Age Groups:\n(0=Under 18, 1=18-24, 2=25-34, 3=35-44, 4=45-54, 5=55-64)',fontsize=16)
ax.set_ylabel('Number of Respondents Unhappy',fontsize=16)
plt.suptitle('Unhappiness or Depression Among Different Age Groups', fontsize=18)

Text(0.5, 0.98, 'Unhappiness or Depression Among Different Age Groups')

<Figure size 1300x900 with 0 Axes>


#Encoding

unhappy = {'Sometimes': 0, 'Often': 1, 'Seldom': 2, 'Almost Always':3}


male = df_new['Gender'] == 'Male'
female = df_new['Gender'] == 'Female'


#Frequency of Unhappy or Depressed
#Value counts

un = df_new['Unhappy_or_Depressed'].replace(unhappy)
un.value_counts()

0    44
1    27
2    24
3     7
Name: Unhappy_or_Depressed, dtype: int64


#for plotting
unCount1 = un[male].dropna()
unCount2 = un[female].dropna()


#for plotting
unFreq1 = unCount1.value_counts()
unFreq2 = unCount2.value_counts()


#****************************************************#
#How unhappy or depressed are different gender groups?
#****************************************************#

"""This chart below and value counts above highlight, that most of our respondents are feeling unhappy/ depressed 
only 'Sometimes'. More males appear to be 'Often' unhappy or depressed, compared to their female counterparts. 
The minority of our respondents are 'Almost always' unhappy or depressed, with slighly more females than males """

plotdata = pd.DataFrame({'Male': unFreq1, 'Female': unFreq2})
ax = plotdata.plot(kind='barh', stacked=True, fontsize=20, figsize=(10,7))
ax.set_xlabel('Number of Respondents Based on Gender',fontsize=16)
ax.set_ylabel('Unhappy or Depressed:\n(Sometimes: 0, Often: 1, Seldom: 2, Almost Always:3)',fontsize=16)
plt.suptitle('Unhappiness or Depression Level Among Different Genders', fontsize=18)

Text(0.5, 0.98, 'Unhappiness or Depression Level Among Different Genders')


#Encoding the categorical data
#We're unable to change categorical to numerical data. Hence, we assign a numerical scale to our likert data

ImpactCommunication = {'Same as Usual': 0, 'More Than Usual': 1, 'Less Than Usual': 2}


#value counts
#Replacing the actual values with encoded values

impact = df_new['Impact on Communication'].replace(ImpactCommunication)
impact.value_counts()

1    35
0    35
2    32
Name: Impact on Communication, dtype: int64


#number of Gender category
male = df_new['Gender'] == 'Male'
female = df_new['Gender'] == 'Female'


#For plotting
impactScores1 = impact[male].dropna()
impactScores2 = impact[female].dropna()


#For plotting
impactFreq1 = impactScores1.value_counts()
impactFreq2 = impactScores2.value_counts()


#Frequency table
plotdata = pd.DataFrame({'Male': impactFreq1, 'Female': impactFreq2})
plotdata.head()


#****************************************************#
#What is the impact on communication - Based on gender?
#****************************************************#

"""
This chart below and value counts above highlight that our respondents are stating the impact on communication 
remains the same (35 respondents), which is true.  Our communication is still on going, thanks to the Internet.

However, not far behind, 32 respondents state they have less communication with family and friends.
There is a faily even balance between male and females, in all 3 types of responses.

"""

ax = plotdata.plot(kind='barh', stacked=True, fontsize=16, figsize=(10,7))
ax.set_xlabel('Number of Respondents Based on Gender',fontsize=16)
ax.set_ylabel("Impact on Communication:\n(Same as Usual: 0, More Than Usual: 1, Less Than Usual: 2",fontsize=16)
plt.suptitle('Impact on Communication Level on Different Genders', fontsize=18)
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))#insert legend box, outside chart

<matplotlib.legend.Legend at 0x7ff611c7d670>


#number of Age category
age18 = df_new['Age'] == 'Under 18'
age18_24 = df_new['Age'] == '18-24'
age25_34 = df_new['Age'] == '25-34'
age35_44 = df_new['Age'] == '35-44'
age45_54 = df_new['Age'] == '45-54'
age55_64 = df_new['Age'] == '55-64'


#For plotting
impactScores1 = impact[age18].dropna()
impactScores2 = impact[age18_24].dropna()
impactScores3 = impact[age25_34].dropna()
impactScores4 = impact[age35_44].dropna()
impactScores5 = impact[age45_54].dropna()
impactScores6 = impact[age55_64].dropna()


#For plotting
impactFreq1 = impactScores1.value_counts()
impactFreq2 = impactScores2.value_counts()
impactFreq3 = impactScores3.value_counts()
impactFreq4 = impactScores4.value_counts()
impactFreq5 = impactScores5.value_counts()
impactFreq6 = impactScores6.value_counts()


#*****************************************#
#Impact on Communication - Based on age group
#*****************************************#

"""This chart below highlights that our repondents that are aged between 25 and 34 felt the impact, of lack of 
communication the most. However, our repondents that are aged between 35 and 44 highlighted they experienced more 
communication during the Covid 19 lockdown.
"""
plotdata = pd.DataFrame({'Under 18': impactFreq1, '18-24': impactFreq2, '25-34':impactFreq3, '35-44':impactFreq4, '45-54':impactFreq5, '55-64':impactFreq6})
ax = plotdata.plot(kind='barh', stacked=True, fontsize=16, figsize=(10,7))
ax.set_xlabel('Number of Respondents Based on Age Groups',fontsize=16)
ax.set_ylabel("Impact on Communication:\n(Same as Usual: 0, More Than Usual: 1, Less Than Usual: 2",fontsize=16)
plt.suptitle('Impact on Communication Level Among Different Age Groups', fontsize=18)
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))#insert legend box, outside chart

#*Reference 7 (refer to the foot of this notebook) was used to summarize the above.

<matplotlib.legend.Legend at 0x7ff611d7d8e0>


#Encoding
financial = {'Less Than Usual': 0, 'Same as Usual': 1, 'More Than Usual': 2}


#value counts
#Replacing the actual values with encoded values

fin = df_new['Impact on Financial Status'].replace(financial)
fin.value_counts()

0    48
1    42
2    12
Name: Impact on Financial Status, dtype: int64


#For plotting
finCount1 = fin[male].dropna()
finCount2 = fin[female].dropna()


#For plotting
finFreq1 = finCount1.value_counts()
finFreq2 = finCount2.value_counts()


#******************************************#
#Impact on financial status - Based on gender
#******************************************#

"""This chart below and value counts above highlight that the impact on financial status is very significant. 
   More respondents (48 repondents) are stating that their financial status has declined (i.e. they now earn 'less 
   than usual'. This reflects the people who lost their jobs or were furloughed during the Covid 19 lockdown.
   The chart also highlights there are more males than females in this predicament.
"""
plotdata = pd.DataFrame({'Male': finFreq1, 'Female': finFreq2})
ax = plotdata.plot(kind='barh', stacked=True, fontsize=16, figsize=(10,7))
ax.set_xlabel('Number of Repondents Based on Gender',fontsize=16)
ax.set_ylabel('Impact on Financial Status:\n(Less Than Usual: 0, Same as Usual: 1, More Than Usual: 2)',fontsize=16)
plt.suptitle('Impact on Financial Status Level on Different Genders', fontsize=18)

Text(0.5, 0.98, 'Impact on Financial Status Level on Different Genders')


#Encoding
study = {'Neutral': 0, 'Unsatisfied': 1, 'Satisfied': 2, 'Very Unsatisfied':3}


#Frequency of Study or Work Satisfaction 
stud = df_new['Study or Work Satisfaction'].replace(study)
stud.value_counts()

0    39
1    35
2    22
3     6
Name: Study or Work Satisfaction, dtype: int64


#For plotting
studyCount1 = stud[male].dropna()
studyCount2 = stud[female].dropna()


#For plotting
studyFreq1 = studyCount1.value_counts()
studyFreq2 = studyCount2.value_counts()


#********************************************#
#Study or Work Satisfaction - Based on Gender#
#********************************************#

"""Most of our respondent's stance was neutral (39 counts), based on study or work satisfaction during the 
   Covid 19 lockdown. However, not far behind, 35 respondents stated they were unsatisfied with their study/ work 
   during this lockdown. The least amount of respondents (6 counts) stated they were 'Very unsatisfied'.
   There is a faily even balance between males and females, in all 4 types of responses.
   

"""

plotdata = pd.DataFrame({'Male': studyFreq1, 'Female': studyFreq2})
ax=plotdata.plot(kind='barh', stacked=True, fontsize=16, figsize=(10,7))
ax.set_xlabel('Number of Respondents Based on Gender', fontsize=16)
ax.set_ylabel('Study or Work Satisfaction:\n(Neutral: 0, Unsatisfied: 1, Satisfied: 2, Very Unsatisfied:3)',fontsize=16)
plt.suptitle('Study or Work Satisfaction Among Different Genders', fontsize=18)

Text(0.5, 0.98, 'Study or Work Satisfaction Among Different Genders')


#number of Age category
age18 = df_new['Age'] == 'Under 18'
age18_24 = df_new['Age'] == '18-24'
age25_34 = df_new['Age'] == '25-34'
age35_44 = df_new['Age'] == '35-44'
age45_54 = df_new['Age'] == '45-54'
age55_64 = df_new['Age'] == '55-64'


#For plotting
impactScores1 = impact[age18].dropna()
impactScores2 = impact[age18_24].dropna()
impactScores3 = impact[age25_34].dropna()
impactScores4 = impact[age35_44].dropna()
impactScores5 = impact[age45_54].dropna()
impactScores6 = impact[age55_64].dropna()


#For plotting
impactFreq1 = impactScores1.value_counts()
impactFreq2 = impactScores2.value_counts()
impactFreq3 = impactScores3.value_counts()
impactFreq4 = impactScores4.value_counts()
impactFreq5 = impactScores5.value_counts()
impactFreq6 = impactScores6.value_counts()


#For plotting
finScores1 = fin[age18].dropna()
finScores2 = fin[age18_24].dropna()
finScores3 = fin[age25_34].dropna()
finScores4 = fin[age35_44].dropna()
finScores5 = fin[age45_54].dropna()
finScores6 = fin[age55_64].dropna()


#For plotting
finFreq1 = finScores1.value_counts()
finFreq2 = finScores2.value_counts()
finFreq3 = finScores3.value_counts()
finFreq4 = finScores4.value_counts()
finFreq5 = finScores5.value_counts()
finFreq6 = finScores6.value_counts()


#************************************************#
#Impact on financial status - Based on age groups#
#************************************************#


"""This chart below highlights that most of our respondents who are between the age of 18 to 54 faced
   the brunt of the impact on their financial status i.e. earned 'Less than usual' during the Covid 19 lockdown
   There is also a small minority in the 25-54 age band who earned 'More than usual' during the Covid
   19 lockdown.  Deeper research is required to reveal futher insights.
   
"""
plotdata = pd.DataFrame({'Under 18': finFreq1, '18-24': finFreq2, '25-34': finFreq3, '35-44': finFreq4, '45-54': finFreq5, '55-64': finFreq6})
ax = plotdata.plot(kind='barh', stacked=True, fontsize=16, figsize=(10,7))
ax.set_xlabel('Number of Respondents Based on Age Groups',fontsize=16)
ax.set_ylabel('Impact on Financial Status:\n(Less Than Usual: 0, Same as Usual: 1, More Than Usual: 2)',fontsize=16)
plt.suptitle('Impact on Financial Status Among Different Age Groups', fontsize=18)

Text(0.5, 0.98, 'Impact on Financial Status Among Different Age Groups')


#For plotting
unScores1 = un[age18].dropna()
unScores2 = un[age18_24].dropna()
unScores3 = un[age25_34].dropna()
unScores4 = un[age35_44].dropna()
unScores5 = un[age45_54].dropna()
unScores6 = un[age55_64].dropna()


#For plotting
unFreq1 = unScores1.value_counts()
unFreq2 = unScores2.value_counts()
unFreq3 = unScores3.value_counts()
unFreq4 = unScores4.value_counts()
unFreq5 = unScores5.value_counts()
unFreq6 = unScores6.value_counts()


#Age Group frequency
df_new['Age'].value_counts()

35-44       35
25-34       32
45-54       20
18-24        9
Under 18     3
55-64        3
Name: Age, dtype: int64


#Encoding
age = {'Under 18': 0, '18-24': 1, '25-34':2, '35-44':3, '45-54':4, '55-64':5}


#Replacing with encoded values
ageCol = df_new['Age'].replace(age)


ageCol

0      3
1      3
2      3
3      3
4      0
      ..
97     0
98     2
99     2
100    3
101    2
Name: Age, Length: 102, dtype: int64


import seaborn as sns


#***************************************************#
#Boxplot based on Unhappy or Depressed and Age Group
#***************************************************#

"""
This boxtplot below highlights that our respondents who are unhappy or depressed, mostly range between the age of 
18 to 64 
"""

fig, ax = plt.subplots(figsize=(18, 14), dpi = 150)
sns.set_style('darkgrid')

sns.boxplot(x='Unhappy_or_Depressed',y=ageCol,data=df_new, ax=ax).set_title('Unhappy or Depressed Boxplot - Based on Age Groups',fontsize=20)


plt.ylabel('Age:\n(0=Under 18, 1=18-24, 2=25-34, 3=35-44, 4=45-54, 5=55-64)', fontsize = 20)
plt.xlabel('Unhappy or Depressed', fontsize = 20)
plt.xticks(rotation = 360, fontsize = 16)
plt.yticks(rotation = 0, fontsize = 16)

plt.show()


#Replacing with the encoded values
df_new['Age'] = df_new['Age'].replace(age)


#*********************************************************#
#Boxplot based on Study or Work Satisfaction and Age groups
#*********************************************************#

"""This boxplot highlights that most of our repondents were 'Very unsatisfied' with their study or work, during 
    the Covid 19 lockdown. That is, age groups 0 to 4 (under 18 to 54 years old) inclusive.
    
    This leaves age group 5 (ages 55 to 64).  This age group appears to be the only unique maximum out of all four 
    options. Infering, the older generation may not get phased with traumatic experiences. It can be identified in
    the 'Satisifed' area of the boxplot.  
    We have also identified some outliers for this 'Satisfied' area of the boxplot, they are age groups 0-1 
    (under 18 to 24 years old) inclusive. In other words, it was rare for this younger generation to be satisfied
    with work/ study during the Covid 19 lockdown. """
    
   

fig, ax = plt.subplots(figsize=(18, 14), dpi = 150)
sns.set_style('darkgrid')

sns.boxplot(x='Study or Work Satisfaction',y=ageCol,data=df_new, ax=ax).set_title('Study or Work Satisfaction Boxplot Based on Different Age Groups',fontsize=20)


plt.ylabel('Age:\n(0=Under 18, 1=18-24, 2=25-34, 3=35-44, 4=45-54, 5=55-64)', fontsize = 20)
plt.xlabel('Study or Work Satisfaction', fontsize = 20)
plt.xticks(rotation = 360, fontsize = 16)
plt.yticks(rotation = 0, fontsize = 16)

plt.show()


#*******************************************************
#Boxplot based on Impact on Communication and Age groups
#*******************************************************

"""This boxplot highlights that our respondents who are aged between, under 18 and 54, had a 'More than usual' 
   impact on their communication. In other words, they experienced more communication during the Covid 19 lockdown.  
   The median age group for this 'More than usual' selection was 35-44 years old (group 3).
   The minimum/ 1st Quartile are age groups 0 to 2 (under 18 to 34) inclusive.
   The maximum/ 3rd Quartile are age groups 3.5 to 4 (estimated age 40 to 54 ) inclusive"""


fig, ax = plt.subplots(figsize=(18, 14), dpi = 150)
sns.set_style('darkgrid')

sns.boxplot(x='Impact on Communication',y=ageCol,data=df_new, ax=ax).set_title('Impact on Communication Boxplot Based on Different Age Groups',fontsize=20)


plt.ylabel('Age:\n(0=Under 18, 1=18-24, 2=25-34, 3=35-44, 4=45-54, 5=55-64)', fontsize = 20)
plt.xlabel('Impact on Communication', fontsize = 20)
plt.xticks(rotation = 360, fontsize = 16)
plt.yticks(rotation = 0, fontsize = 16)

plt.show()


#**********************************************************
#Boxplot based on Impact on Financial Status and Age groups
#**********************************************************

"""This boxplot below highlights that there are a lot of respondents beween the age of 25 to 35 whose financial 
status declined.
"""
fig, ax = plt.subplots(figsize=(18, 14), dpi = 150)
sns.set_style('darkgrid')

sns.boxplot(x='Impact on Financial Status',y=ageCol,data=df_new, ax=ax).set_title('Impact on Financial Status Boxplot Based on Different Age Groups',fontsize=20)


plt.ylabel('Age:\n(0=Under 18, 1=18-24, 2=25-34, 3=35-44, 4=45-54, 5=55-64)', fontsize = 20)
plt.xlabel('Impact on Financial Status', fontsize = 20)
plt.xticks(rotation = 360, fontsize = 16)
plt.yticks(rotation = 0, fontsize = 16)

plt.show()


from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()


#label encoding the column that matters i.e. relevant key variables
df_new['Impact on Financial Status']= label_encoder.fit_transform(df_new['Impact on Financial Status'])
df_new['Gender']= label_encoder.fit_transform(df_new['Gender'])
df_new['Impact on Communication']= label_encoder.fit_transform(df_new['Impact on Communication'])
df_new['Unhappy_or_Depressed']= label_encoder.fit_transform(df_new['Unhappy_or_Depressed'])
df_new['Study or Work Satisfaction']= label_encoder.fit_transform(df_new['Study or Work Satisfaction'])


"""These Univariate and Multivariate analysis plots show that if we apply any algorithms, any linearly separable 
algorithm will be suffice. In other words, we can distinguish the data points easily and they are not
overlapped with each other. Then any linear line based algorithm such as Linear Regression or Logistic Regression 
would be enough for this dataset 
"""
pair = sns.pairplot(df_new)
pair.fig.suptitle("Univariate and Multivariate analysis", y=1.08)

Text(0.5, 1.08, 'Univariate and Multivariate analysis')


#Skewness is a measure of the symmetry in a distribution. A symmetrical data set will have a skewness equal to 0. 
#Hence, a normal distribution will have a skewness of 0.
#The objective of choosing features/columns with skewed features greater than 0.75 is because we're searching to 
#work with features that have a more normal like distribution, so this is why we apply a log transformation, so to
#normalize the distribution i.e.skewness=0.

#*Reference 8 (refer to the foot of this notebook) was used to summarize the above.


#This 'Impact on Financial Status' column is not skewed. 
fig, ax = plt.subplots(figsize=(13, 10))
sns.distplot(df_new['Impact on Financial Status'])
ax.set_title('Impact on Financial Status - Skewness', fontsize=20)

plt.xticks(rotation = 360, fontsize = 16)
plt.yticks(rotation = 0, fontsize = 16)
plt.ylabel('Density', fontsize = 20)
plt.xlabel('Impact on Financial Status', fontsize = 20)

plt.show()

Intel MKL WARNING: Support of Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) enabled only processors has been deprecated. Intel oneAPI Math Kernel Library 2025.0 will require Intel(R) Advanced Vector Extensions (Intel(R) AVX) instructions.


#This 'Impact on Communication'column is also not skewed
fig, ax = plt.subplots(figsize=(13, 10))
sns.distplot(df_new['Impact on Communication'])
ax.set_title('Impact on Communication - Skewness', fontsize=20)

plt.xticks(rotation = 360, fontsize = 16)
plt.yticks(rotation = 0, fontsize = 16)
plt.ylabel('Density', fontsize = 20)
plt.xlabel('Impact on Communication', fontsize = 20)

plt.show()


#This 'Unhappy or Depressed' column is a little bit negatively skewed

fig, ax = plt.subplots(figsize=(13, 10))
sns.distplot(df_new['Unhappy_or_Depressed'])
ax.set_title('Unhappy or Depressed - Skewness', fontsize=20)

plt.xticks(rotation = 360, fontsize = 16)
plt.yticks(rotation = 0, fontsize = 16)
plt.ylabel('Density', fontsize = 20)
plt.xlabel('Unhappy or Depressed', fontsize = 20)

plt.show()


#This 'Study or Work Satisfaction' column is a little bit positively skewed

fig, ax = plt.subplots(figsize=(13, 10))
sns.distplot(df_new['Study or Work Satisfaction'])
ax.set_title('Study or Work Satisfaction - Skewness', fontsize=20)

plt.xticks(rotation = 360, fontsize = 16)
plt.yticks(rotation = 0, fontsize = 16)
plt.ylabel('Density', fontsize = 20)
plt.xlabel('Study or Work Satisfaction', fontsize = 20)

plt.show()


#***************************************************************************************************************
#The skewness of the columns is less than 0.75. Hence, there is no requirement to handle the skewness. If there is 
#a skewness greater than 0.75, then we should handle that skewness by log transformation. However, in this case it
#is not necessary.
#Refer to skewness values for features/ columns below:
#***************************************************************************************************************

df_new.skew()

Age                          -0.201356
Gender                       -0.199981
Impact on Communication      -0.054436
Impact on Financial Status    0.118847
Unhappy_or_Depressed         -0.497372
Study or Work Satisfaction    0.224462
dtype: float64


#***************************************************************************************************************
#It appears that the columns are mesokurtic in nature because of their negative values. This is acceptable 
#because the kurtosis of a univariate normal distribution (i.e. bell curve) is three. 
#If a distribution’s kurtosis is less than three, then the distribution produces 
#fewer and less extreme outliers than the normal distribution. A kurtosis greater 
#than three highlights the distribution produces more outliers than the normal distribution.
#***************************************************************************************************************

#Ref: https://en.wikipedia.org/wiki/Kurtosis

df_new.kurtosis()

Age                          -0.073757
Gender                       -1.999612
Impact on Communication      -1.487382
Impact on Financial Status   -1.884858
Unhappy_or_Depressed         -1.030484
Study or Work Satisfaction   -1.284739
dtype: float64


#****************************************************************************************************************
#Since there isn't any significant skewness in the columns/ features, then there aren't any 'significant' outliers 
#More skewed distribution tends to have more outliers. Whereas, it is quite the opposite for less skewed
#distributions.
#****************************************************************************************************************


#**************************************************************************************************************
#There may be hidden and intricate relationships between the variables/ features in this dataset. Hence, we 
#require to compute and setup a correlation scores matrix, as can be seen below. Spearman's correlation was used 
#to summarize the linear relationship between any two data samples.

#It was important to focus on the key variables, from the likert scale questions.

#We opted for Spearman's Correlation over other correlations* i.e. Pearson Correlation for the following reasons: 
#Pearson correlation measures the linear relationship between two continuous variables and the Spearman 
#correlation measures the data sets based on their ranks for each variable. Spearman does not require assumption 
#of the relationship between variables, but it is always good to examine the relationship between variables. The 
#main difference between them is that Pearson is most appropriate for measurements from an interval scale and the 
#Spearman correlation is appropriate for measurements taken from ordinal scales - the majority of our data types.
#****************************************************************************************************************

#*Reference 9 (refer to the foot of this notebook) was used to summarize the above.


#Use key variables i.e. likert scale features
df_hm = df_new[['Age', 'Gender', 'Region', 'Impact on Communication', 'Impact on Financial Status', 'Unhappy_or_Depressed', 'Study or Work Satisfaction']].copy()


# 1) computing correlation scores
correlation_scores = df_hm.corr(method = 'spearman')

correlation_scores


#***************************************************************************************************************
#From the heatmap below or correlation scores above, we can determine some strong, positive  correlations.

#1. Age versus Unhappy_or_Depressed: There's a strong, positive correlation between these two variables. There is a 
#decline in unhappiness/ depression as our respondent's age increases i.e. the older our repondents are, the
#happier they were during the Covid 19 lockdown.

#2. Impact on Financial Status versus Unhappy_or_Depressed: There's a strong, positive correlation between these 
#two variables.  There is an increase in unhappiness as our repondent's financial status declines.

# correlation Heatmap
plt.figure(figsize = (10, 10), dpi = 200)
df_hm = sns.heatmap(correlation_scores, vmax = 0.9, linewidths = 0.5, cmap = "YlGnBu",
                annot = True, square = True)

bottom, top = df_hm.get_ylim()
df_hm.set_ylim(bottom + 0.5, top - 0.5)
plt.title('Correlation Heatmap')
plt.show()


df['Supported_Activities']=df['Supported_Activities'].astype('category')
df['Healthcare_Access']=df['Healthcare_Access'].astype('category')


#12
convert_cat_to_num = {"Social_Support": {"I don't think I need social support": 0, "No - I became unable to use the social support groups I was using before the pandemic": 1,"No - I think I need social support but cannot access them to begin with":2,"Yes - I started using social support groups after the start of the pandemic, and able to access them":3,"Yes - I was using social support groups before, and continue to do so":4}}
df = df.replace(convert_cat_to_num )
df.head()


df['Social_Support'].value_counts(sort=False)

0    57
1    10
2    22
3    11
4     2
Name: Social_Support, dtype: int64


#Plot bar chart for Social Support Needs

"""It's quite evident from the barchart and value counts below, that most (57) of our repondents thought 
   they didn't require social support during the Covid 19 lockdown. The gender split is very even and the majority
   (18 respondents) are in the 35-44 age band.
   The minority of our respondents (2) were using social groups before the Covid 19 lockdown, and continue to do 
   so. The gender split is all Males (2 respondents) and those 2 males are evenly split across the 25-34 and 35-44
   age groups. 
   """

counts=df['Social_Support'].value_counts(sort=False)

sns.set(font_scale=1.0)
plt.figure(figsize=(10, 7),dpi=80)

plt.tick_params(labelsize=15);
plt.suptitle('Social Support Needed or Not', fontsize=18)

ax=counts.plot.barh()
ax.set_xlabel('Frequency',fontsize=16)
ax.set_ylabel("Social Support",fontsize=16)
plt.show()


pd.crosstab(df['Social_Support'], df['Gender'], margins=True)


pd.crosstab(df['Social_Support'], df['Age'], margins=True)


convert_cat_to_num2 = {"Healthcare_Access": {"I didn't need access to healthcare during Covid 19": 0, "I couldn't find over the counter medication in pharmacies or shops": 1,"I couldn't get my prescription medication":2,"I needed a GP appointment and could only get a telephone or email one":3,"I needed a GP appointment and couldn't get one at all":4,"My regular hospital-based treatment was disrupted":5,"Planned procedures that I required, were cancelled":6}}
df = df.replace(convert_cat_to_num2)
df.head()


df['Healthcare_Access'].value_counts(sort=False)

1     8
2     6
0    54
3     4
4     6
5    13
6    11
Name: Healthcare_Access, dtype: int64


#*******************************************************************
#Was healthcare access required or not during the Covid 19 lockdown?
#*******************************************************************

"""Again, it is very evident from the barchart and value counts below, that most repondents (54) didn't require 
   access to healthcare during Covid 19 lockdown.  The gender split is very even and the majority
   (18 respondents) are in the 35-44 age band.
   Although, just under half of our respondents still wish access to healthcare. This can be highlighted by 
   the demand, by viewing the responses in categories 1 to 6 inclusive. 
   The minority of our respondents (4) are in category 3('I needed a GP appointment and could only get a 
   telephone or email one'), where there is an even split of males and females.  The ages span across from
   25 to 54 years old.
   """

counts=df['Healthcare_Access'].value_counts(sort=False)

sns.set(font_scale=1.0)
plt.figure(figsize=(10, 7),dpi=80)

plt.tick_params(labelsize=15);
plt.suptitle('Healthcare_Access Needed or Not', fontsize=18)


ax=counts.plot.barh()
ax.set_xlabel('Frequency',fontsize=16)
ax.set_ylabel('Healthcare_Access',fontsize=16)
plt.show()


pd.crosstab(df['Healthcare_Access'], df['Gender'], margins=True)


pd.crosstab(df['Healthcare_Access'], df['Age'], margins=True)


#Lets use clustering to answer this particular question
#Agglomerative clustering is a bottom-up clustering technique and will try to make clusters of data points that 
#are close to each other. And a dendrogram is used to visualize all clusters in a tree format in such a way that 
#all data points (called leaves) are visible and nicely arranged at the same depth. So what you get is data points
#clustered together as well as visibility of all points

#*******************************************************************************************************************
#Which of the supported activities mentioned in Q11 helped our respondents feel better, during the Covid 19 lockdown?
#*******************************************************************************************************************

"""We used a clustering algorithm to find similar data/ answer question 11. 
    In the bar chart below, we can infer that the counts of Cluster 3 is higher. The supported activities 
    associated with Cluster 3 consist of waching tv, exercise, doing remote work or study, housework and sometimes
    talk with family and friends - refer to dataframe below, for verification.
    Hence, we can infer that during Covid 19 lockdown most of our respondents participated in these aforementioned 
    supported activities, to help them feel better
    """

#Lets use clustering to 
proc_wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

def text_normalization(col):

    col = re.sub(r'[^a-zA-Z\s]', '', col, re.I|re.A)
    col = col.lower()
    col = col.strip()

    tokens_no = proc_wpt.tokenize(col)

    filter_tokens = [token for token in tokens_no if token not in stop_words]

    col = ' '.join(filter_tokens)
    return col

normalize = np.vectorize(text_normalization)

normalize_data = normalize(df['Supported_Activities'])
normalize_data

#*References 10 and 11 (refer to the foot of this notebook) was used to summarize the above.

array(['exercisewatching reading newstalking friends familydoing remote work study',
       'exercisewatching tv ie newsdoing remote work study',
       'exercisewatching tv ie newsdoing houseworkdoing remote work study',
       'exercisewatching reading newsdoing remote work study',
       'exercisewatching tv ie newsdoing houseworkplaying music',
       'watching reading newstalking friends family', 'remote work study',
       'playing music',
       'watching tv ie newswatching reading newsi didnt need help',
       'exercisedoing housework',
       'watching reading newsdoing remote work study',
       'talking friends family', 'didnt need help', 'didnt need help',
       'exercisewatching tv ie newstalking friends family',
       'watching reading newsdoing remote work study',
       'talking friends family',
       'talking friends familydoing remote work study',
       'talking friends family', 'watching reading news',
       'watching reading news',
       'watching reading newsdoing houseworktalking friends family',
       'didnt need help', 'housework',
       'watching reading newstalking friends family', 'didnt need help',
       'talking friends familydoing remote work study',
       'exercisewatching reading news', 'playing music',
       'didnt need help', 'watching tv ie newswatching reading news',
       'houseworkdoing remote work study', 'watching reading news',
       'didnt need help', 'exerciseplaying music', 'didnt need help',
       'talking friends familyplaying music',
       'exercisedoing remote work study',
       'exercisewatching reading newstalking friends family',
       'watching reading newsdoing housework',
       'watching tv ie newstalking friends family', 'didnt need help',
       'didnt need help',
       'talking friends familydoing remote work studyplaying music',
       'exercisewatching reading newsdoing housework',
       'watching reading newstalking friends family',
       'watching tv ie news',
       'watching tv ie newsdoing houseworkdoing remote work study',
       'playing music', 'remote work studyplaying music', 'exercise',
       'watching reading newstalking friends family',
       'talking friends family', 'houseworkdoing remote work study',
       'didnt need help', 'talking friends familyplaying music',
       'exercisedoing houseworkplaying musici didnt need help',
       'watching tv ie newstalking friends familydoing remote work studyplaying music',
       'exercisewatching reading newstalking friends family',
       'exercisewatching reading newsdoing remote work studyplaying music',
       'watching reading newstalking friends family',
       'talking friends familydoing remote work studyplaying music',
       'houseworktalking friends familydoing remote work studyplaying music',
       'watching reading newsdoing remote work study',
       'exercisewatching reading newstalking friends familydoing remote work study',
       'watching tv ie newstalking friends familydoing remote work study',
       'watching reading newstalking friends familydoing remote work studyi didnt need help',
       'exercisewatching tv ie newstalking friends family',
       'talking friends familyplaying music',
       'watching tv ie newstalking friends familydoing remote work study',
       'didnt need help', 'didnt need help',
       'talking friends familydoing remote work study',
       'watching tv ie newswatching reading newstalking friends familyplaying music',
       'watching tv ie newswatching reading newstalking friends familydoing remote work study',
       'talking friends familydoing remote work study',
       'watching tv ie newstalking friends familyplaying music',
       'exercisedoing houseworkdoing remote work studyplaying music',
       'watching tv ie newswatching reading newsdoing houseworkdoing remote work study',
       'exercisedoing houseworkdoing remote work study',
       'watching tv ie newsdoing remote work studyplaying music',
       'exercisewatching tv ie newswatching reading newsdoing remote work study',
       'watching tv ie news', 'watching reading newsplaying music',
       'watching reading newsdoing remote work study', 'didnt need help',
       'remote work study',
       'exercisewatching tv ie newstalking friends familydoing remote work study',
       'talking friends family',
       'watching tv ie newswatching reading news',
       'watching tv ie newswatching reading news', 'didnt need help',
       'exercisedoing housework',
       'exercisewatching reading newsdoing remote work study',
       'exercisewatching reading newstalking friends familydoing remote work study',
       'houseworktalking friends familydoing remote work studyplaying music',
       'exercisedoing houseworktalking friends familyplaying music',
       'watching reading newsdoing houseworktalking friends familydoing remote work study',
       'exercisewatching reading newsdoing houseworkplaying music',
       'watching tv ie newsdoing houseworktalking friends familydoing remote work study',
       'watching tv ie newswatching reading newsdoing remote work studyplaying music',
       'talking friends familydoing remote work studyplaying music'],
      dtype='<U85')


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


count_vector = CountVectorizer(min_df=0., max_df=1.)
count_vector_matrix = count_vector.fit_transform(normalize_data)
count_vector_matrix = count_vector_matrix.toarray()
count_vector_matrix

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 0, 1],
       [0, 0, 0, ..., 1, 0, 1],
       ...,
       [0, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 0, 0, 1]])


feature_names = count_vector.get_feature_names_out()
pd.DataFrame(count_vector_matrix, columns=feature_names)


vector = CountVectorizer(ngram_range=(2,2))
vector_matrix = vector.fit_transform(normalize_data)
vector_matrix = vector_matrix.toarray()
feature_names = vector.get_feature_names_out()
pd.DataFrame(vector_matrix, columns=feature_names)


from sklearn.feature_extraction.text import TfidfVectorizer
Tfid_v = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
Tfid_v_matrix = Tfid_v.fit_transform(normalize_data)
Tfid_v_matrix = Tfid_v_matrix.toarray()

feature_names = Tfid_v.get_feature_names_out()
pd.DataFrame(np.round(Tfid_v_matrix, 2), columns=feature_names)


similar = cosine_similarity(Tfid_v_matrix)
df_of_similar = pd.DataFrame(similar)
df_of_similar


#Plot dendrogram
#The number of clusters will be the number of vertical lines which are being intersected by the line drawn using 
#the threshold i.e. 3 clusters

#In a dendrogram, there are clades which indicate each branch, leaves which are the terminal end of each clade, 
#and outliers. Outliers are clades with only one leaf.

#Hence, there are no outliers in our Dendogram below



from scipy.cluster.hierarchy import dendrogram, linkage
Z = linkage(similar, 'ward')

plt.figure(figsize=(15,15))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Supported Activities')
plt.ylabel('Distance')
dendrogram(Z)
plt.axhline(y=14, c='k', ls='--', lw=10)

<matplotlib.lines.Line2D at 0x7ff5d01d8dc0>


#Dataframe with key variables; we're interested in 'Supported_Activities' associated with Cluster 3

from scipy.cluster.hierarchy import fcluster
distance = 8

clustering = fcluster(Z, distance, criterion='distance')
clustering = pd.DataFrame(clustering, columns=['Cluster'])
new_dataframe = pd.concat([df, clustering], axis=1)
new_dataframe


data = new_dataframe.groupby('Cluster').groups
data

{1: [8, 12, 13, 22, 25, 29, 33, 35, 41, 42, 54, 56, 66, 70, 71, 85, 91], 2: [17, 26, 43, 49, 61, 62, 72, 75, 77, 95, 101], 3: [0, 1, 2, 3, 6, 10, 15, 31, 37, 47, 53, 57, 59, 63, 64, 65, 69, 74, 78, 79, 80, 81, 84, 86, 87, 93, 94, 97, 99, 100], 4: [5, 11, 14, 16, 18, 21, 24, 38, 40, 45, 51, 52, 58, 60, 67, 73, 76, 88], 5: [19, 20, 27, 30, 32, 46, 82, 89, 90], 6: [4, 7, 9, 23, 28, 34, 36, 39, 44, 48, 50, 55, 68, 83, 92, 96, 98]}


#Cluster all the responses

ar1 = []
ar2 = []
ar3 = []
for x, y in data.items():
    for i in range(len(y)):
        
        ar1.append(x)
        ar2.append(y[i])
        result1 = new_dataframe['Supported_Activities'].loc[y[i]]
        ar3.append(result1)
        
list_tuples = list(zip(ar1, ar2, ar3))

df_with_cluster = pd.DataFrame(list_tuples,
                  columns = ['Cluster', 'Index','Supported_Activities']) 
     
# Print data
df_with_cluster.to_csv('survey009.csv')

print(df_with_cluster.to_string())

     Cluster  Index                                                                                            Supported_Activities
0          1      8                                     Watching TV i.e. not the news,Watching/ Reading the news,I didn't need help
1          1     12                                                                                              I didn't need help
2          1     13                                                                                              I didn't need help
3          1     22                                                                                              I didn't need help
4          1     25                                                                                              I didn't need help
5          1     29                                                                                              I didn't need help
6          1     33                                                                                              I didn't need help
7          1     35                                                                                              I didn't need help
8          1     41                                                                                              I didn't need help
9          1     42                                                                                              I didn't need help
10         1     54                                                                                              I didn't need help
11         1     56                                                 Doing exercise,Doing housework,Playing music,I didn't need help
12         1     66             Watching/ Reading the news,Talking to friends/ family,Doing remote work or study,I didn't need help
13         1     70                                                                                              I didn't need help
14         1     71                                                                                              I didn't need help
15         1     85                                                                                              I didn't need help
16         1     91                                                                                              I didn't need help
17         2     17                                                           Talking to friends/ family,Doing remote work or study
18         2     26                                                           Talking to friends/ family,Doing remote work or study
19         2     43                                             Talking to friends/ family,Doing remote work or study,Playing music
20         2     49                                                                        Doing remote work or study,Playing music
21         2     61                                             Talking to friends/ family,Doing remote work or study,Playing music
22         2     62                             Doing housework,Talking to friends/ family,Doing remote work or study,Playing music
23         2     72                                                           Talking to friends/ family,Doing remote work or study
24         2     75                                                           Talking to friends/ family,Doing remote work or study
25         2     77                                         Doing exercise,Doing housework,Doing remote work or study,Playing music
26         2     95                             Doing housework,Talking to friends/ family,Doing remote work or study,Playing music
27         2    101                                             Talking to friends/ family,Doing remote work or study,Playing music
28         3      0                 Doing exercise,Watching/ Reading the news,Talking to friends/ family,Doing remote work or study
29         3      1                                         Doing exercise,Watching TV i.e. not the news,Doing remote work or study
30         3      2                         Doing exercise,Watching TV i.e. not the news,Doing housework,Doing remote work or study
31         3      3                                            Doing exercise,Watching/ Reading the news,Doing remote work or study
32         3      6                                                                                      Doing remote work or study
33         3     10                                                           Watching/ Reading the news,Doing remote work or study
34         3     15                                                           Watching/ Reading the news,Doing remote work or study
35         3     31                                                                      Doing housework,Doing remote work or study
36         3     37                                                                       Doing exercise,Doing remote work or study
37         3     47                                        Watching TV i.e. not the news,Doing housework,Doing remote work or study
38         3     53                                                                      Doing housework,Doing remote work or study
39         3     57               Watching TV i.e. not the news,Talking to friends/ family,Doing remote work or study,Playing music
40         3     59                              Doing exercise,Watching/ Reading the news,Doing remote work or study,Playing music
41         3     63                                                           Watching/ Reading the news,Doing remote work or study
42         3     64                 Doing exercise,Watching/ Reading the news,Talking to friends/ family,Doing remote work or study
43         3     65                             Watching TV i.e. not the news,Talking to friends/ family,Doing remote work or study
44         3     69                             Watching TV i.e. not the news,Talking to friends/ family,Doing remote work or study
45         3     74  Watching TV i.e. not the news,Watching/ Reading the news,Talking to friends/ family,Doing remote work or study
46         3     78             Watching TV i.e. not the news,Watching/ Reading the news,Doing housework,Doing remote work or study
47         3     79                                                       Doing exercise,Doing housework,Doing remote work or study
48         3     80                                          Watching TV i.e. not the news,Doing remote work or study,Playing music
49         3     81              Doing exercise,Watching TV i.e. not the news,Watching/ Reading the news,Doing remote work or study
50         3     84                                                           Watching/ Reading the news,Doing remote work or study
51         3     86                                                                                      Doing remote work or study
52         3     87              Doing exercise,Watching TV i.e. not the news,Talking to friends/ family,Doing remote work or study
53         3     93                                            Doing exercise,Watching/ Reading the news,Doing remote work or study
54         3     94                 Doing exercise,Watching/ Reading the news,Talking to friends/ family,Doing remote work or study
55         3     97                Watching/ Reading the news,Doing housework,Talking to friends/ family,Doing remote work or study
56         3     99             Watching TV i.e. not the news,Doing housework,Talking to friends/ family,Doing remote work or study
57         3    100               Watching TV i.e. not the news,Watching/ Reading the news,Doing remote work or study,Playing music
58         4      5                                                           Watching/ Reading the news,Talking to friends/ family
59         4     11                                                                                      Talking to friends/ family
60         4     14                                         Doing exercise,Watching TV i.e. not the news,Talking to friends/ family
61         4     16                                                                                      Talking to friends/ family
62         4     18                                                                                      Talking to friends/ family
63         4     21                                           Watching/ Reading the news,Doing housework,Talking to friends/ family
64         4     24                                                           Watching/ Reading the news,Talking to friends/ family
65         4     38                                            Doing exercise,Watching/ Reading the news,Talking to friends/ family
66         4     40                                                        Watching TV i.e. not the news,Talking to friends/ family
67         4     45                                                           Watching/ Reading the news,Talking to friends/ family
68         4     51                                                           Watching/ Reading the news,Talking to friends/ family
69         4     52                                                                                      Talking to friends/ family
70         4     58                                            Doing exercise,Watching/ Reading the news,Talking to friends/ family
71         4     60                                                           Watching/ Reading the news,Talking to friends/ family
72         4     67                                         Doing exercise,Watching TV i.e. not the news,Talking to friends/ family
73         4     73               Watching TV i.e. not the news,Watching/ Reading the news,Talking to friends/ family,Playing music
74         4     76                                          Watching TV i.e. not the news,Talking to friends/ family,Playing music
75         4     88                                                                                      Talking to friends/ family
76         5     19                                                                                      Watching/ Reading the news
77         5     20                                                                                      Watching/ Reading the news
78         5     27                                                                       Doing exercise,Watching/ Reading the news
79         5     30                                                        Watching TV i.e. not the news,Watching/ Reading the news
80         5     32                                                                                      Watching/ Reading the news
81         5     46                                                                                   Watching TV i.e. not the news
82         5     82                                                                                   Watching TV i.e. not the news
83         5     89                                                        Watching TV i.e. not the news,Watching/ Reading the news
84         5     90                                                        Watching TV i.e. not the news,Watching/ Reading the news
85         6      4                                      Doing exercise,Watching TV i.e. not the news,Doing housework,Playing music
86         6      7                                                                                                   Playing music
87         6      9                                                                                  Doing exercise,Doing housework
88         6     23                                                                                                 Doing housework
89         6     28                                                                                                   Playing music
90         6     34                                                                                    Doing exercise,Playing music
91         6     36                                                                        Talking to friends/ family,Playing music
92         6     39                                                                      Watching/ Reading the news,Doing housework
93         6     44                                                       Doing exercise,Watching/ Reading the news,Doing housework
94         6     48                                                                                                   Playing music
95         6     50                                                                                                  Doing exercise
96         6     55                                                                        Talking to friends/ family,Playing music
97         6     68                                                                        Talking to friends/ family,Playing music
98         6     83                                                                        Watching/ Reading the news,Playing music
99         6     92                                                                                  Doing exercise,Doing housework
100        6     96                                         Doing exercise,Doing housework,Talking to friends/ family,Playing music
101        6     98                                         Doing exercise,Watching/ Reading the news,Doing housework,Playing music


#Plot final cluster
#There are 6 clusters, cluster 3 is the mode i.e. highest frequency count 

import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(figsize=(20,20))  

sns.displot(df_with_cluster, x="Cluster")

<seaborn.axisgrid.FacetGrid at 0x7ff613ae9280>

<Figure size 2000x2000 with 0 Axes>


#******************************************************************************************************************
#Which of the psychological reactions mentioned in Q10 occurred with our respondents, during the Covid 19 lockdown?
#******************************************************************************************************************

"""We will use a clustering algorithm again, to find similar data/ answer question.  In the bar chart below, we can 
    infer that the counts of cluster 3 is higher.  The psychological reactions associated with Cluster 3 consists
    of Changes in appetite, energy and activity levels, difficulty sleeping or nightmares and concentrating and 
    making decisions - refer to dataframe below, for verification. 
    Hence, we can infer that during Covid 19 lockdown, most of our respondents suffered from the aforementioned
    psychological reactions.  
    """

proc_wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

def text_normalization(col):

    col = re.sub(r'[^a-zA-Z\s]', '', col, re.I|re.A)
    col = col.lower()
    col = col.strip()

    tokens_no = proc_wpt.tokenize(col)

    filter_tokens = [token for token in tokens_no if token not in stop_words]

    col = ' '.join(filter_tokens)
    return col

normalize = np.vectorize(text_normalization)

normalize_data = normalize(df['Psychological_Reactions'])
normalize_data

array(['difficulty concentrating making decisionsdifficulty sleeping nightmaresphysical reactions headaches body pains stomach problems skin rashesincreased use alcohol tobacco drugs',
       'changes appetite energy activity levelsdifficulty concentrating making decisions',
       'changes appetite energy activity levelsdifficulty concentrating making decisionsworsening chronic health problemsincreased use alcohol tobacco drugs',
       'changes appetite energy activity levelsdifficulty sleeping nightmaresworsening chronic health problemsincreased use alcohol tobacco drugs',
       'changes appetite energy activity levelsdifficulty concentrating making decisionsphysical reactions headaches body pains stomach problems skin rashesworsening chronic health problems',
       'changes appetite energy activity levelsdifficulty concentrating making decisions',
       'changes appetite energy activity levelsphysical reactions headaches body pains stomach problems skin rashesworsening chronic health problems',
       'difficulty concentrating making decisionsdifficulty sleeping nightmaresincreased use alcohol tobacco drugs',
       'difficulty sleeping nightmaresphysical reactions headaches body pains stomach problems skin rashes',
       'none', 'difficulty concentrating making decisions',
       'increased use alcohol tobacco drugs',
       'difficulty sleeping nightmaresphysical reactions headaches body pains stomach problems skin rashes',
       'none', 'changes appetite energy activity levels',
       'changes appetite energy activity levelsdifficulty concentrating making decisionsdifficulty sleeping nightmares',
       'physical reactions headaches body pains stomach problems skin rashesworsening chronic health problems',
       'difficulty concentrating making decisions',
       'changes appetite energy activity levels',
       'difficulty concentrating making decisions',
       'difficulty concentrating making decisions', 'none',
       'changes appetite energy activity levelsphysical reactions headaches body pains stomach problems skin rashes',
       'difficulty concentrating making decisions',
       'difficulty sleeping nightmares', 'none',
       'increased use alcohol tobacco drugs', 'none',
       'changes appetite energy activity levels',
       'difficulty concentrating making decisions',
       'difficulty concentrating making decisions',
       'changes appetite energy activity levels',
       'changes appetite energy activity levelsdifficulty sleeping nightmaresincreased use alcohol tobacco drugs',
       'none', 'changes appetite energy activity levels', 'none',
       'physical reactions headaches body pains stomach problems skin rashes',
       'none',
       'changes appetite energy activity levelsworsening chronic health problems',
       'physical reactions headaches body pains stomach problems skin rashes',
       'none', 'increased use alcohol tobacco drugs', 'none',
       'changes appetite energy activity levelsdifficulty sleeping nightmares',
       'changes appetite energy activity levelsdifficulty concentrating making decisions',
       'physical reactions headaches body pains stomach problems skin rashes',
       'difficulty concentrating making decisions',
       'changes appetite energy activity levelsdifficulty concentrating making decisionsdifficulty sleeping nightmaresphysical reactions headaches body pains stomach problems skin rashes',
       'difficulty concentrating making decisions', 'none',
       'difficulty sleeping nightmares',
       'difficulty concentrating making decisionsphysical reactions headaches body pains stomach problems skin rashes',
       'physical reactions headaches body pains stomach problems skin rashes',
       'changes appetite energy activity levelsdifficulty sleeping nightmares',
       'worsening chronic health problems',
       'worsening chronic health problems',
       'difficulty concentrating making decisionsworsening chronic health problems',
       'changes appetite energy activity levelsdifficulty concentrating making decisionsphysical reactions headaches body pains stomach problems skin rashes',
       'difficulty concentrating making decisionsdifficulty sleeping nightmaresworsening chronic health problems',
       'difficulty concentrating making decisionsphysical reactions headaches body pains stomach problems skin rashesincreased use alcohol tobacco drugs',
       'none', 'none',
       'changes appetite energy activity levelsdifficulty concentrating making decisions',
       'difficulty concentrating making decisionsphysical reactions headaches body pains stomach problems skin rashesworsening chronic health problems',
       'changes appetite energy activity levelsdifficulty concentrating making decisions',
       'none',
       'difficulty concentrating making decisionsdifficulty sleeping nightmares',
       'none', 'none', 'none', 'none', 'none',
       'difficulty sleeping nightmaresincreased use alcohol tobacco drugs',
       'difficulty concentrating making decisionsdifficulty sleeping nightmares',
       'difficulty concentrating making decisionsdifficulty sleeping nightmaresworsening chronic health problems',
       'difficulty concentrating making decisionsdifficulty sleeping nightmaresworsening chronic health problems',
       'difficulty sleeping nightmaresphysical reactions headaches body pains stomach problems skin rashesincreased use alcohol tobacco drugs',
       'changes appetite energy activity levelsdifficulty concentrating making decisions',
       'changes appetite energy activity levelsdifficulty concentrating making decisionsdifficulty sleeping nightmares',
       'difficulty sleeping nightmaresincreased use alcohol tobacco drugs',
       'physical reactions headaches body pains stomach problems skin rashesincreased use alcohol tobacco drugs',
       'difficulty concentrating making decisions',
       'changes appetite energy activity levels',
       'difficulty sleeping nightmaresphysical reactions headaches body pains stomach problems skin rashes',
       'changes appetite energy activity levelsdifficulty sleeping nightmares',
       'physical reactions headaches body pains stomach problems skin rashes',
       'changes appetite energy activity levels',
       'difficulty concentrating making decisions', 'none',
       'physical reactions headaches body pains stomach problems skin rashes',
       'difficulty sleeping nightmaresphysical reactions headaches body pains stomach problems skin rashes',
       'none', 'changes appetite energy activity levels',
       'difficulty concentrating making decisionsdifficulty sleeping nightmares',
       'difficulty concentrating making decisionsdifficulty sleeping nightmares',
       'none',
       'changes appetite energy activity levelsdifficulty concentrating making decisions',
       'difficulty concentrating making decisions',
       'increased use alcohol tobacco drugs', 'none',
       'difficulty concentrating making decisionsdifficulty sleeping nightmaresnone',
       'difficulty concentrating making decisions'], dtype='<U181')


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


count_vector = CountVectorizer(min_df=0., max_df=1.)
count_vector_matrix = count_vector.fit_transform(normalize_data)
count_vector_matrix = count_vector_matrix.toarray()
count_vector_matrix

array([[0, 1, 0, ..., 1, 1, 0],
       [1, 0, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])


feature_names = count_vector.get_feature_names_out()
pd.DataFrame(count_vector_matrix, columns=feature_names)


vector = CountVectorizer(ngram_range=(2,2))
vector_matrix = vector.fit_transform(normalize_data)
vector_matrix = vector_matrix.toarray()
feature_names = vector.get_feature_names_out()
pd.DataFrame(vector_matrix, columns=feature_names)


from sklearn.feature_extraction.text import TfidfVectorizer
Tfid_v = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
Tfid_v_matrix = Tfid_v.fit_transform(normalize_data)
Tfid_v_matrix = Tfid_v_matrix.toarray()

feature_names = Tfid_v.get_feature_names_out()
pd.DataFrame(np.round(Tfid_v_matrix, 2), columns=feature_names)


similar = cosine_similarity(Tfid_v_matrix)
df_of_similar = pd.DataFrame(similar)
df_of_similar


#Plot dendrogram
#The number of clusters will be the number of vertical lines which are being intersected by the line drawn using 
#the threshold i.e. 3 clusters

##In a dendrogram, there are clades which indicate each branch, leaves which are the terminal end of each clade, 
#and outliers. Outliers are clades with only one leaf.

#We have a clade with only one leaf, in our dendogram below - blue branch and single leaf on extreme left hand side

from scipy.cluster.hierarchy import dendrogram, linkage
Z = linkage(similar, 'ward')

plt.figure(figsize=(15,15))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Supported Activities')
plt.ylabel('Distance')
dendrogram(Z)
plt.axhline(y=20, c='k', ls='--', lw=10)

<matplotlib.lines.Line2D at 0x7ff603ebe4c0>


#Dataframe with key variables; we're interested in the 'Psychological_Reactions' associated with Cluster 3

from scipy.cluster.hierarchy import fcluster
distance = 8

clustering = fcluster(Z, distance, criterion='distance')
clustering = pd.DataFrame(clustering, columns=['Cluster'])
new_dataframe_02 = pd.concat([df, clustering], axis=1)
new_dataframe_02


data = new_dataframe_02.groupby('Cluster').groups
data

{1: [9, 13, 21, 25, 27, 33, 35, 37, 40, 42, 49, 60, 61, 65, 67, 68, 69, 70, 71, 88, 91, 95, 99], 2: [0, 4, 6, 8, 12, 16, 22, 36, 39, 45, 47, 51, 52, 57, 59, 63, 76, 80, 83, 85, 89, 90], 3: [1, 2, 3, 5, 14, 15, 18, 28, 31, 32, 34, 38, 43, 44, 53, 62, 64, 77, 78, 82, 84, 86, 92, 96], 4: [10, 17, 19, 20, 23, 29, 30, 46, 48, 81, 87, 97, 101], 5: [7, 11, 26, 41, 72, 79, 98], 6: [24, 50, 54, 55, 56, 58, 66, 73, 74, 75, 93, 94, 100]}


ar1 = []
ar2 = []
ar3 = []

for x, y in data.items():
    for i in range(len(y)):
        
        ar1.append(x)
        ar2.append(y[i])
        result1 = new_dataframe_02['Psychological_Reactions'].loc[y[i]]
        ar3.append(result1)
        
list_tuples = list(zip(ar1, ar2, ar3)) 
    

list_tuples  
  

df_with_cluster_02= pd.DataFrame(list_tuples,
                  columns = ['Cluster', 'Index','Psychological_Reactions']) 
     
# Print data
#df_with_cluster_02.to_csv('survey0011.csv')

print(df_with_cluster_02.to_string())

     Cluster  Index                                                                                                                                                                                                 Psychological_Reactions
0          1      9                                                                                                                                                                                                       None of the above
1          1     13                                                                                                                                                                                                       None of the above
2          1     21                                                                                                                                                                                                       None of the above
3          1     25                                                                                                                                                                                                       None of the above
4          1     27                                                                                                                                                                                                       None of the above
5          1     33                                                                                                                                                                                                       None of the above
6          1     35                                                                                                                                                                                                       None of the above
7          1     37                                                                                                                                                                                                       None of the above
8          1     40                                                                                                                                                                                                       None of the above
9          1     42                                                                                                                                                                                                       None of the above
10         1     49                                                                                                                                                                                                       None of the above
11         1     60                                                                                                                                                                                                       None of the above
12         1     61                                                                                                                                                                                                       None of the above
13         1     65                                                                                                                                                                                                       None of the above
14         1     67                                                                                                                                                                                                       None of the above
15         1     68                                                                                                                                                                                                       None of the above
16         1     69                                                                                                                                                                                                       None of the above
17         1     70                                                                                                                                                                                                       None of the above
18         1     71                                                                                                                                                                                                       None of the above
19         1     88                                                                                                                                                                                                       None of the above
20         1     91                                                                                                                                                                                                       None of the above
21         1     95                                                                                                                                                                                                       None of the above
22         1     99                                                                                                                                                                                                       None of the above
23         2      0    Difficulty concentrating and making decisions,Difficulty sleeping or nightmares,Physical reactions, such as headaches, body pains, stomach problems and skin rashes,Increased use of alcohol, tobacco or other drugs
24         2      4  Changes in appetite, energy and activity levels,Difficulty concentrating and making decisions,Physical reactions, such as headaches, body pains, stomach problems and skin rashes,Worsening of chronic health problems
25         2      6                                                Changes in appetite, energy and activity levels,Physical reactions, such as headaches, body pains, stomach problems and skin rashes,Worsening of chronic health problems
26         2      8                                                                                                   Difficulty sleeping or nightmares,Physical reactions, such as headaches, body pains, stomach problems and skin rashes
27         2     12                                                                                                   Difficulty sleeping or nightmares,Physical reactions, such as headaches, body pains, stomach problems and skin rashes
28         2     16                                                                                                Physical reactions, such as headaches, body pains, stomach problems and skin rashes,Worsening of chronic health problems
29         2     22                                                                                     Changes in appetite, energy and activity levels,Physical reactions, such as headaches, body pains, stomach problems and skin rashes
30         2     36                                                                                                                                     Physical reactions, such as headaches, body pains, stomach problems and skin rashes
31         2     39                                                                                                                                     Physical reactions, such as headaches, body pains, stomach problems and skin rashes
32         2     45                                                                                                                                     Physical reactions, such as headaches, body pains, stomach problems and skin rashes
33         2     47     Changes in appetite, energy and activity levels,Difficulty concentrating and making decisions,Difficulty sleeping or nightmares,Physical reactions, such as headaches, body pains, stomach problems and skin rashes
34         2     51                                                                                       Difficulty concentrating and making decisions,Physical reactions, such as headaches, body pains, stomach problems and skin rashes
35         2     52                                                                                                                                     Physical reactions, such as headaches, body pains, stomach problems and skin rashes
36         2     57                                       Changes in appetite, energy and activity levels,Difficulty concentrating and making decisions,Physical reactions, such as headaches, body pains, stomach problems and skin rashes
37         2     59                                      Difficulty concentrating and making decisions,Physical reactions, such as headaches, body pains, stomach problems and skin rashes,Increased use of alcohol, tobacco or other drugs
38         2     63                                                  Difficulty concentrating and making decisions,Physical reactions, such as headaches, body pains, stomach problems and skin rashes,Worsening of chronic health problems
39         2     76                                                  Difficulty sleeping or nightmares,Physical reactions, such as headaches, body pains, stomach problems and skin rashes,Increased use of alcohol, tobacco or other drugs
40         2     80                                                                                    Physical reactions, such as headaches, body pains, stomach problems and skin rashes,Increased use of alcohol, tobacco or other drugs
41         2     83                                                                                                   Difficulty sleeping or nightmares,Physical reactions, such as headaches, body pains, stomach problems and skin rashes
42         2     85                                                                                                                                     Physical reactions, such as headaches, body pains, stomach problems and skin rashes
43         2     89                                                                                                                                     Physical reactions, such as headaches, body pains, stomach problems and skin rashes
44         2     90                                                                                                   Difficulty sleeping or nightmares,Physical reactions, such as headaches, body pains, stomach problems and skin rashes
45         3      1                                                                                                                           Changes in appetite, energy and activity levels,Difficulty concentrating and making decisions
46         3      2                                     Changes in appetite, energy and activity levels,Difficulty concentrating and making decisions,Worsening of chronic health problems,Increased use of alcohol, tobacco or other drugs
47         3      3                                                 Changes in appetite, energy and activity levels,Difficulty sleeping or nightmares,Worsening of chronic health problems,Increased use of alcohol, tobacco or other drugs
48         3      5                                                                                                                           Changes in appetite, energy and activity levels,Difficulty concentrating and making decisions
49         3     14                                                                                                                                                                         Changes in appetite, energy and activity levels
50         3     15                                                                                         Changes in appetite, energy and activity levels,Difficulty concentrating and making decisions,Difficulty sleeping or nightmares
51         3     18                                                                                                                                                                         Changes in appetite, energy and activity levels
52         3     28                                                                                                                                                                         Changes in appetite, energy and activity levels
53         3     31                                                                                                                                                                         Changes in appetite, energy and activity levels
54         3     32                                                                                      Changes in appetite, energy and activity levels,Difficulty sleeping or nightmares,Increased use of alcohol, tobacco or other drugs
55         3     34                                                                                                                                                                         Changes in appetite, energy and activity levels
56         3     38                                                                                                                                    Changes in appetite, energy and activity levels,Worsening of chronic health problems
57         3     43                                                                                                                                       Changes in appetite, energy and activity levels,Difficulty sleeping or nightmares
58         3     44                                                                                                                           Changes in appetite, energy and activity levels,Difficulty concentrating and making decisions
59         3     53                                                                                                                                       Changes in appetite, energy and activity levels,Difficulty sleeping or nightmares
60         3     62                                                                                                                           Changes in appetite, energy and activity levels,Difficulty concentrating and making decisions
61         3     64                                                                                                                           Changes in appetite, energy and activity levels,Difficulty concentrating and making decisions
62         3     77                                                                                                                           Changes in appetite, energy and activity levels,Difficulty concentrating and making decisions
63         3     78                                                                                         Changes in appetite, energy and activity levels,Difficulty concentrating and making decisions,Difficulty sleeping or nightmares
64         3     82                                                                                                                                                                         Changes in appetite, energy and activity levels
65         3     84                                                                                                                                       Changes in appetite, energy and activity levels,Difficulty sleeping or nightmares
66         3     86                                                                                                                                                                         Changes in appetite, energy and activity levels
67         3     92                                                                                                                                                                         Changes in appetite, energy and activity levels
68         3     96                                                                                                                           Changes in appetite, energy and activity levels,Difficulty concentrating and making decisions
69         4     10                                                                                                                                                                           Difficulty concentrating and making decisions
70         4     17                                                                                                                                                                           Difficulty concentrating and making decisions
71         4     19                                                                                                                                                                           Difficulty concentrating and making decisions
72         4     20                                                                                                                                                                           Difficulty concentrating and making decisions
73         4     23                                                                                                                                                                           Difficulty concentrating and making decisions
74         4     29                                                                                                                                                                           Difficulty concentrating and making decisions
75         4     30                                                                                                                                                                           Difficulty concentrating and making decisions
76         4     46                                                                                                                                                                           Difficulty concentrating and making decisions
77         4     48                                                                                                                                                                           Difficulty concentrating and making decisions
78         4     81                                                                                                                                                                           Difficulty concentrating and making decisions
79         4     87                                                                                                                                                                           Difficulty concentrating and making decisions
80         4     97                                                                                                                                                                           Difficulty concentrating and making decisions
81         4    101                                                                                                                                                                           Difficulty concentrating and making decisions
82         5      7                                                                                        Difficulty concentrating and making decisions,Difficulty sleeping or nightmares,Increased use of alcohol, tobacco or other drugs
83         5     11                                                                                                                                                                        Increased use of alcohol, tobacco or other drugs
84         5     26                                                                                                                                                                        Increased use of alcohol, tobacco or other drugs
85         5     41                                                                                                                                                                        Increased use of alcohol, tobacco or other drugs
86         5     72                                                                                                                                      Difficulty sleeping or nightmares,Increased use of alcohol, tobacco or other drugs
87         5     79                                                                                                                                      Difficulty sleeping or nightmares,Increased use of alcohol, tobacco or other drugs
88         5     98                                                                                                                                                                        Increased use of alcohol, tobacco or other drugs
89         6     24                                                                                                                                                                                       Difficulty sleeping or nightmares
90         6     50                                                                                                                                                                                       Difficulty sleeping or nightmares
91         6     54                                                                                                                                                                                    Worsening of chronic health problems
92         6     55                                                                                                                                                                                    Worsening of chronic health problems
93         6     56                                                                                                                                      Difficulty concentrating and making decisions,Worsening of chronic health problems
94         6     58                                                                                                    Difficulty concentrating and making decisions,Difficulty sleeping or nightmares,Worsening of chronic health problems
95         6     66                                                                                                                                         Difficulty concentrating and making decisions,Difficulty sleeping or nightmares
96         6     73                                                                                                                                         Difficulty concentrating and making decisions,Difficulty sleeping or nightmares
97         6     74                                                                                                    Difficulty concentrating and making decisions,Difficulty sleeping or nightmares,Worsening of chronic health problems
98         6     75                                                                                                    Difficulty concentrating and making decisions,Difficulty sleeping or nightmares,Worsening of chronic health problems
99         6     93                                                                                                                                         Difficulty concentrating and making decisions,Difficulty sleeping or nightmares
100        6     94                                                                                                                                         Difficulty concentrating and making decisions,Difficulty sleeping or nightmares
101        6    100                                                                                                                       Difficulty concentrating and making decisions,Difficulty sleeping or nightmares,None of the above


#Plot final cluster
#There are 6 clusters, cluster 3 is the mode i.e. highest frequency count 

import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(figsize=(20,20))  

sns.displot(df_with_cluster_02, x="Cluster")

<seaborn.axisgrid.FacetGrid at 0x7ff5c1ec1550>

<Figure size 2000x2000 with 0 Axes>


%%js

// Run this cell to update your word count.

function wordcount() {
    let wordCount = 0
    let appendixCount = 0
    let appendix = false

    let cells = Jupyter.notebook.get_cells()
    cells.forEach((cell) => {
        if (cell.cell_type == 'markdown') {
            let text = cell.get_text()
            // Stop counting when get to Appendices.
            if (text.startsWith('## Appendices')) {
                appendix = true
            }
            if (text.startsWith('## Word Count')) {
                text = ''
            }
            if (text) {
                let cellCount = text.toLowerCase().match(/\b[a-z\d]+\b/g).length
                if (!appendix) {
                    wordCount += cellCount
                } else {
                    appendixCount += cellCount
                }
            }
        }
    })
    return [wordCount, appendixCount]
}

let wc = wordcount()
element.append(`Main word count: ${wc[0]} (Appendices word count: ${wc[1]})`)

SOM_Affected	No	Yes
Region
Africa	21.052632	78.947368
North America	22.222222	77.777778
Asia	29.411765	70.588235
South America	30.769231	69.230769
Oceania	33.333333	66.666667
Europe	34.615385	65.384615

SOM_Affected	No	Yes
Age
Under 18	0.000000	100.000000
18-24	22.222222	77.777778
25-34	28.125000	71.875000
45-54	30.000000	70.000000
35-44	31.428571	68.571429
55-64	33.333333	66.666667

	Age	Gender	Impact on Communication	Impact on Financial Status	Unhappy_or_Depressed	Study or Work Satisfaction
Age	1.000000	-0.132517	-0.020084	0.016727	0.212969	0.086226
Gender	-0.132517	1.000000	-0.040468	-0.163612	-0.017754	-0.005663
Impact on Communication	-0.020084	-0.040468	1.000000	-0.112090	0.009470	-0.051955
Impact on Financial Status	0.016727	-0.163612	-0.112090	1.000000	0.105498	-0.112715
Unhappy_or_Depressed	0.212969	-0.017754	0.009470	0.105498	1.000000	-0.128693
Study or Work Satisfaction	0.086226	-0.005663	-0.051955	-0.112715	-0.128693	1.000000

	0	1	2	3	4	5	6	7	8	9	...	92	93	94	95	96	97	98	99	100	101
0	1.000000	0.498775	0.430419	0.673344	0.164119	0.434152	0.546106	0.000000	0.071785	0.0	...	0.0	0.673344	1.000000	0.400388	0.079342	0.582514	0.276027	0.461073	0.225632	0.425897
1	0.498775	1.000000	0.862953	0.774599	0.588959	0.000000	0.567257	0.000000	0.211188	0.0	...	0.0	0.774599	0.498775	0.169360	0.000000	0.425499	0.350102	0.642627	0.527706	0.180150
2	0.430419	0.862953	1.000000	0.668443	0.508244	0.000000	0.489516	0.000000	0.182246	0.0	...	0.0	0.668443	0.430419	0.146149	0.000000	0.367186	0.302122	0.554557	0.455386	0.155461
3	0.673344	0.774599	0.668443	1.000000	0.360587	0.143804	0.633510	0.000000	0.083274	0.0	...	0.0	1.000000	0.673344	0.189140	0.000000	0.578576	0.504758	0.444064	0.415581	0.201190
4	0.164119	0.588959	0.508244	0.360587	1.000000	0.000000	0.000000	0.170790	0.183394	0.0	...	0.0	0.360587	0.164119	0.122269	0.118998	0.130403	0.833774	0.334619	0.436368	0.130058
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
97	0.582514	0.425499	0.367186	0.578576	0.130403	0.323364	0.485376	0.000000	0.131105	0.0	...	0.0	0.578576	0.582514	0.636750	0.343894	1.000000	0.228563	0.860468	0.395050	0.378535
98	0.276027	0.350102	0.302122	0.504758	0.833774	0.121247	0.000000	0.185191	0.070211	0.0	...	0.0	0.504758	0.276027	0.132579	0.129033	0.228563	1.000000	0.132135	0.326661	0.141025
99	0.461073	0.642627	0.554557	0.444064	0.334619	0.199219	0.453577	0.000000	0.231759	0.0	...	0.0	0.444064	0.461073	0.595034	0.321364	0.860468	0.132135	1.000000	0.493575	0.353735
100	0.225632	0.527706	0.455386	0.415581	0.436368	0.207681	0.263471	0.154370	0.448947	0.0	...	0.0	0.415581	0.225632	0.431309	0.107558	0.395050	0.326661	0.493575	1.000000	0.458788
101	0.425897	0.180150	0.155461	0.201190	0.130058	0.107210	0.317580	0.186073	0.000000	0.0	...	0.0	0.201190	0.425897	0.753134	0.207619	0.378535	0.141025	0.353735	0.458788	1.000000

	activity	alcohol	appetite	body	changes	chronic	concentrating	decisions	decisionsdifficulty	decisionsphysical	...	rashes	rashesincreased	rashesworsening	reactions	skin	sleeping	stomach	tobacco	use	worsening
0	0.00	0.26	0.00	0.22	0.00	0.00	0.17	0.00	0.26	0.00	...	0.0	0.35	0.00	0.22	0.22	0.20	0.22	0.26	0.26	0.0
1	0.35	0.00	0.35	0.00	0.35	0.00	0.30	0.40	0.00	0.00	...	0.0	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.0
2	0.20	0.26	0.20	0.00	0.20	0.26	0.17	0.00	0.00	0.00	...	0.0	0.00	0.00	0.00	0.00	0.00	0.00	0.26	0.26	0.0
3	0.20	0.27	0.20	0.00	0.20	0.27	0.00	0.00	0.00	0.00	...	0.0	0.00	0.00	0.00	0.00	0.21	0.00	0.27	0.27	0.0
4	0.19	0.00	0.19	0.21	0.19	0.25	0.16	0.00	0.00	0.32	...	0.0	0.00	0.34	0.21	0.21	0.00	0.21	0.00	0.00	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
97	0.00	0.00	0.00	0.00	0.00	0.00	0.45	0.61	0.00	0.00	...	0.0	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.0
98	0.00	0.41	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	...	0.0	0.00	0.00	0.00	0.00	0.00	0.00	0.41	0.41	0.0
99	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	...	0.0	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.0
100	0.00	0.00	0.00	0.00	0.00	0.00	0.27	0.00	0.42	0.00	...	0.0	0.00	0.00	0.00	0.00	0.33	0.00	0.00	0.00	0.0
101	0.00	0.00	0.00	0.00	0.00	0.00	0.45	0.61	0.00	0.00	...	0.0	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	0.0

	Unique Response Number	1. To what age group do you belong?	2. Which gender do you most strongly identify with?	3. What is your region of residence?	4. Has the covid-19 lockdown affected your state of mind in any way?	5. Do you share a home with any of the following?	5.a. If you selected Other, please specify:	6. If you compare the pre-COVID-19 lockdown and the actual COVID-19 lockdown, how has:	6.1. Your amount of communication with family and friends changed?	6.2. Your financial status changed?	...	8.3. How satisfied were you with new activities?	9. How many hours per day are/ were you able to spend outside, on average during the lockdown?	10. Select if you have had any of these psychological reactions towards the covid-19 pandemic?	10.a. If you selected Other, please specify:	11. During the Covid 19 lockdown, which of the following helped you feel better?	11.a. If you selected Other, please specify:	12. Have you been tempted to access support from social groups during the covid-19 pandemic?	12.a. If you selected Other, please specify:	13. Do you feel that the Covid 19 pandemic has affected your access to healthcare for other conditions?	13.a. If you selected Other, please specify:
0	753624-753615-79015877	35-44	Male	Africa	Yes	Your partner,Other Family Memebers	NaN	NaN	More Than Usual	Less Than Usual	...	Satisfied	2-3 hours per day	Difficulty concentrating and making decisions,Difficulty sleeping or nightmares,Physical reactions, such as headaches, body pains, stomach problems and skin rashes,Increased use of alcohol, tobacc...	NaN	Doing exercise,Watching/ Reading the news,Talking to friends/ family,Doing remote work or study	NaN	I don't think I need social support	NaN	I didn't need access to healthcare during Covid 19	NaN
1	753624-753615-79019215	35-44	Male	Europe	Yes	No one, I live alone	NaN	NaN	More Than Usual	Same as Usual	...	Unsatisfied	0-1 hours per day	Changes in appetite, energy and activity levels,Difficulty concentrating and making decisions	NaN	Doing exercise,Watching TV i.e. not the news,Doing remote work or study	NaN	I don't think I need social support	NaN	I didn't need access to healthcare during Covid 19	NaN
2	753624-753615-79015968	35-44	Male	Asia	Yes	Your partner	NaN	NaN	More Than Usual	More Than Usual	...	Neutral	0-1 hours per day	Changes in appetite, energy and activity levels,Difficulty concentrating and making decisions,Worsening of chronic health problems,Increased use of alcohol, tobacco or other drugs	NaN	Doing exercise,Watching TV i.e. not the news,Doing housework,Doing remote work or study	NaN	No - I became unable to use the social support groups I was using before the pandemic	NaN	I couldn't find over the counter medication in pharmacies or shops	NaN
3	753624-753615-79020017	35-44	Male	Europe	Yes	Your partner,Your child/ children,Other Family Memebers	NaN	NaN	More Than Usual	Less Than Usual	...	Neutral	0-1 hours per day	Changes in appetite, energy and activity levels,Difficulty sleeping or nightmares,Worsening of chronic health problems,Increased use of alcohol, tobacco or other drugs	NaN	Doing exercise,Watching/ Reading the news,Doing remote work or study	NaN	No - I think I need social support but cannot access them to begin with	NaN	I needed a GP appointment and couldn't get one at all	NaN
4	753624-753615-79020109	Under 18	Male	Asia	Yes	Other Family Memebers	NaN	NaN	More Than Usual	Same as Usual	...	Unsatisfied	0-1 hours per day	Changes in appetite, energy and activity levels,Difficulty concentrating and making decisions,Physical reactions, such as headaches, body pains, stomach problems and skin rashes,Worsening of chron...	NaN	Doing exercise,Watching TV i.e. not the news,Doing housework,Playing music	NaN	No - I think I need social support but cannot access them to begin with	NaN	I didn't need access to healthcare during Covid 19	NaN
97	753624-753615-79060123	Under 18	Male	Africa	Yes	Other Family Memebers,One or multiple housemates	NaN	NaN	Same as Usual	Less Than Usual	...	Unsatisfied	0-1 hours per day	Difficulty concentrating and making decisions	NaN	Watching/ Reading the news,Doing housework,Talking to friends/ family,Doing remote work or study	NaN	No - I think I need social support but cannot access them to begin with	NaN	I didn't need access to healthcare during Covid 19	NaN
98	753624-753615-79060180	25-34	Female	Africa	No	No one, I live alone	NaN	NaN	More Than Usual	More Than Usual	...	Neutral	0-1 hours per day	Increased use of alcohol, tobacco or other drugs	NaN	Doing exercise,Watching/ Reading the news,Doing housework,Playing music	NaN	No - I became unable to use the social support groups I was using before the pandemic	NaN	I didn't need access to healthcare during Covid 19	NaN
99	753624-753615-79060308	25-34	Male	Europe	Yes	Other Family Memebers	NaN	NaN	Same as Usual	Same as Usual	...	Neutral	0-1 hours per day	None of the above	NaN	Watching TV i.e. not the news,Doing housework,Talking to friends/ family,Doing remote work or study	NaN	I don't think I need social support	NaN	Planned procedures that I required, were cancelled	NaN
100	753624-753615-79060402	35-44	Female	South America	Yes	Your partner,Your child/ children,Other Family Memebers,One or multiple housemates	NaN	NaN	More Than Usual	Less Than Usual	...	Satisfied	0-1 hours per day	Difficulty concentrating and making decisions,Difficulty sleeping or nightmares,None of the above	NaN	Watching TV i.e. not the news,Watching/ Reading the news,Doing remote work or study,Playing music	NaN	No - I think I need social support but cannot access them to begin with	NaN	Planned procedures that I required, were cancelled	NaN
101	753624-753615-79060522	25-34	Male	Asia	Yes	Your partner,Your child/ children,Other Family Memebers	NaN	NaN	Less Than Usual	Less Than Usual	...	Neutral	0-1 hours per day	Difficulty concentrating and making decisions	NaN	Talking to friends/ family,Doing remote work or study,Playing music	NaN	No - I think I need social support but cannot access them to begin with	NaN	My regular hospital-based treatment was disrupted	NaN

	Unique Response Number	1. To what age group do you belong?	2. Which gender do you most strongly identify with?	3. What is your region of residence?	4. Has the covid-19 lockdown affected your state of mind in any way?	5. Do you share a home with any of the following?	5.a. If you selected Other, please specify:	6. If you compare the pre-COVID-19 lockdown and the actual COVID-19 lockdown, how has:	6.1. Your amount of communication with family and friends changed?	6.2. Your financial status changed?	...	8.3. How satisfied were you with new activities?	9. How many hours per day are/ were you able to spend outside, on average during the lockdown?	10. Select if you have had any of these psychological reactions towards the covid-19 pandemic?	10.a. If you selected Other, please specify:	11. During the Covid 19 lockdown, which of the following helped you feel better?	11.a. If you selected Other, please specify:	12. Have you been tempted to access support from social groups during the covid-19 pandemic?	12.a. If you selected Other, please specify:	13. Do you feel that the Covid 19 pandemic has affected your access to healthcare for other conditions?	13.a. If you selected Other, please specify:
Total_null	0	0	0	0	0	0	102	102	0	0	...	0	0	0	102	0	102	0	102	0	102
Total_percent(%)	0.0	0.0	0.0	0.0	0.0	0.0	100.0	100.0	0.0	0.0	...	0.0	0.0	0.0	100.0	0.0	100.0	0.0	100.0	0.0	100.0
Types	object	object	object	object	object	object	float64	float64	object	object	...	object	object	object	float64	object	float64	object	float64	object	float64

Gender	Female	Male	All
Hours Spent Outside
0-1 hours per day	6	16	22
2-3 hours per day	15	17	32
4-5 hours per day	18	20	38
6-8 hours per day	6	2	8
More than 8 hours per day	1	1	2
All	46	56	102

	0	1	2	3
0	Your partner	Other Family Memebers	None	None
1	No one	I live alone	None	None
2	Your partner	None	None	None
3	Your partner	Your child/ children	Other Family Memebers	None
4	Other Family Memebers	None	None	None
...	...	...	...	...
97	Other Family Memebers	One or multiple housemates	None	None
98	No one	I live alone	None	None
99	Other Family Memebers	None	None	None
100	Your partner	Your child/ children	Other Family Memebers	One or multiple housemates
101	Your partner	Your child/ children	Other Family Memebers	None

		0
id	option
0	0	Your partner
0	1	Other Family Memebers
1	0	No one
1	1	I live alone
2	0	Your partner
...	...	...
100	2	Other Family Memebers
100	3	One or multiple housemates
101	0	Your partner
	1	Your child/ children
	2	Other Family Memebers

	Question	Sometimes	Often	Seldom	Almost_Always	Never	Gender
0	How often did you feel unhappy/ depressed	44	27	24	7	0	Gender
1	How often did you feel under stress	48	24	24	6	0	Gender
2	How often did you feel anxious/ a loss of confidence	43	38	10	6	5	Gender

Age	Under 18	18-24	25-34	35-44	45-54	55-64	All
Social_Support
0	1	9	12	18	14	3	57
1	0	0	6	3	1	0	10
2	2	0	7	10	3	0	22
3	0	0	6	3	2	0	11
4	0	0	1	1	0	0	2
All	3	9	32	35	20	3	102

Gender	Female	Male	All
Healthcare_Access
1	3	5	8
2	2	4	6
0	26	28	54
3	2	2	4
4	2	4	6
5	7	6	13
6	4	7	11
All	46	56	102

	didnt	exercise	exercisedoing	exerciseplaying	exercisewatching	family	familydoing	familyplaying	friends	help	...	playing	reading	remote	study	studyi	studyplaying	talking	tv	watching	work
0	0	0	0	0	1	0	1	0	1	0	...	0	1	1	1	0	0	0	0	0	1
1	0	0	0	0	1	0	0	0	0	0	...	0	0	1	1	0	0	0	1	0	1
2	0	0	0	0	1	0	0	0	0	0	...	0	0	1	1	0	0	0	1	0	1
3	0	0	0	0	1	0	0	0	0	0	...	0	1	1	1	0	0	0	0	0	1
4	0	0	0	0	1	0	0	0	0	0	...	0	0	0	0	0	0	0	1	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
97	0	0	0	0	0	0	1	0	1	0	...	0	1	1	1	0	0	0	0	1	1
98	0	0	0	0	1	0	0	0	0	0	...	0	1	0	0	0	0	0	0	0	0
99	0	0	0	0	0	0	1	0	1	0	...	0	0	1	1	0	0	0	1	1	1
100	0	0	0	0	0	0	0	0	0	0	...	0	1	1	0	0	1	0	1	1	1
101	0	0	0	0	0	0	1	0	1	0	...	0	0	1	0	0	1	1	0	0	1

	didnt need	exercisedoing housework	exercisedoing houseworkdoing	exercisedoing houseworkplaying	exercisedoing houseworktalking	exercisedoing remote	exerciseplaying music	exercisewatching reading	exercisewatching tv	familydoing remote	...	remote work	studyi didnt	studyplaying music	talking friends	tv ie	watching reading	watching tv	work study	work studyi	work studyplaying
0	0	0	0	0	0	0	0	1	0	1	...	1	0	0	0	0	0	0	1	0	0
1	0	0	0	0	0	0	0	0	1	0	...	1	0	0	0	1	0	0	1	0	0
2	0	0	0	0	0	0	0	0	1	0	...	1	0	0	0	1	0	0	1	0	0
3	0	0	0	0	0	0	0	1	0	0	...	1	0	0	0	0	0	0	1	0	0
4	0	0	0	0	0	0	0	0	1	0	...	0	0	0	0	1	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
97	0	0	0	0	0	0	0	0	0	1	...	1	0	0	0	0	1	0	1	0	0
98	0	0	0	0	0	0	0	1	0	0	...	0	0	0	0	0	0	0	0	0	0
99	0	0	0	0	0	0	0	0	0	1	...	1	0	0	0	1	0	1	1	0	0
100	0	0	0	0	0	0	0	0	0	0	...	1	0	1	0	1	0	1	0	0	1
101	0	0	0	0	0	0	0	0	0	1	...	1	0	1	1	0	0	0	0	0	1

	didnt	exercise	exercisedoing	exerciseplaying	exercisewatching	family	familydoing	familyplaying	friends	help	...	playing	reading	remote	study	studyi	studyplaying	talking	tv	watching	work
0	0	0	0	0	1	0	1	0	1	0	...	0	1	1	1	0	0	0	0	0	1
1	0	0	0	0	1	0	0	0	0	0	...	0	0	1	1	0	0	0	1	0	1
2	0	0	0	0	1	0	0	0	0	0	...	0	0	1	1	0	0	0	1	0	1
3	0	0	0	0	1	0	0	0	0	0	...	0	1	1	1	0	0	0	0	0	1
4	0	0	0	0	1	0	0	0	0	0	...	0	0	0	0	0	0	0	1	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
97	0	0	0	0	0	0	1	0	1	0	...	0	1	1	1	0	0	0	0	1	1
98	0	0	0	0	1	0	0	0	0	0	...	0	1	0	0	0	0	0	0	0	0
99	0	0	0	0	0	0	1	0	1	0	...	0	0	1	1	0	0	0	1	1	1
100	0	0	0	0	0	0	0	0	0	0	...	0	1	1	0	0	1	0	1	1	1
101	0	0	0	0	0	0	1	0	1	0	...	0	0	1	0	0	1	1	0	0	1

	didnt need	exercisedoing housework	exercisedoing houseworkdoing	exercisedoing houseworkplaying	exercisedoing houseworktalking	exercisedoing remote	exerciseplaying music	exercisewatching reading	exercisewatching tv	familydoing remote	...	remote work	studyi didnt	studyplaying music	talking friends	tv ie	watching reading	watching tv	work study	work studyi	work studyplaying
0	0	0	0	0	0	0	0	1	0	1	...	1	0	0	0	0	0	0	1	0	0
1	0	0	0	0	0	0	0	0	1	0	...	1	0	0	0	1	0	0	1	0	0
2	0	0	0	0	0	0	0	0	1	0	...	1	0	0	0	1	0	0	1	0	0
3	0	0	0	0	0	0	0	1	0	0	...	1	0	0	0	0	0	0	1	0	0
4	0	0	0	0	0	0	0	0	1	0	...	0	0	0	0	1	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
97	0	0	0	0	0	0	0	0	0	1	...	1	0	0	0	0	1	0	1	0	0
98	0	0	0	0	0	0	0	1	0	0	...	0	0	0	0	0	0	0	0	0	0
99	0	0	0	0	0	0	0	0	0	1	...	1	0	0	0	1	0	1	1	0	0
100	0	0	0	0	0	0	0	0	0	0	...	1	0	1	0	1	0	1	0	0	1
101	0	0	0	0	0	0	0	0	0	1	...	1	0	1	1	0	0	0	0	0	1

	didnt	exercise	exercisedoing	exerciseplaying	exercisewatching	family	familydoing	familyplaying	friends	help	...	playing	reading	remote	study	studyi	studyplaying	talking	tv	watching	work
0	0.0	0.0	0.0	0.0	0.43	0.0	0.41	0.0	0.30	0.0	...	0.0	0.32	0.30	0.35	0.0	0.00	0.00	0.00	0.00	0.30
1	0.0	0.0	0.0	0.0	0.44	0.0	0.00	0.0	0.00	0.0	...	0.0	0.00	0.31	0.36	0.0	0.00	0.00	0.39	0.00	0.31
2	0.0	0.0	0.0	0.0	0.38	0.0	0.00	0.0	0.00	0.0	...	0.0	0.00	0.27	0.31	0.0	0.00	0.00	0.34	0.00	0.27
3	0.0	0.0	0.0	0.0	0.49	0.0	0.00	0.0	0.00	0.0	...	0.0	0.37	0.34	0.40	0.0	0.00	0.00	0.00	0.00	0.34
4	0.0	0.0	0.0	0.0	0.38	0.0	0.00	0.0	0.00	0.0	...	0.0	0.00	0.00	0.00	0.0	0.00	0.00	0.34	0.00	0.00
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
97	0.0	0.0	0.0	0.0	0.00	0.0	0.37	0.0	0.26	0.0	...	0.0	0.28	0.26	0.31	0.0	0.00	0.00	0.00	0.29	0.26
98	0.0	0.0	0.0	0.0	0.42	0.0	0.00	0.0	0.00	0.0	...	0.0	0.31	0.00	0.00	0.0	0.00	0.00	0.00	0.00	0.00
99	0.0	0.0	0.0	0.0	0.00	0.0	0.34	0.0	0.25	0.0	...	0.0	0.00	0.25	0.29	0.0	0.00	0.00	0.31	0.27	0.25
100	0.0	0.0	0.0	0.0	0.00	0.0	0.00	0.0	0.00	0.0	...	0.0	0.26	0.24	0.00	0.0	0.41	0.00	0.31	0.27	0.24
101	0.0	0.0	0.0	0.0	0.00	0.0	0.40	0.0	0.29	0.0	...	0.0	0.00	0.29	0.00	0.0	0.49	0.45	0.00	0.00	0.29

	activity	alcohol	appetite	body	changes	chronic	concentrating	decisions	decisionsdifficulty	decisionsphysical	...	rashes	rashesincreased	rashesworsening	reactions	skin	sleeping	stomach	tobacco	use	worsening
0	0	1	0	1	0	0	1	0	1	0	...	0	1	0	1	1	1	1	1	1	0
1	1	0	1	0	1	0	1	1	0	0	...	0	0	0	0	0	0	0	0	0	0
2	1	1	1	0	1	1	1	0	0	0	...	0	0	0	0	0	0	0	1	1	0
3	1	1	1	0	1	1	0	0	0	0	...	0	0	0	0	0	1	0	1	1	0
4	1	0	1	1	1	1	1	0	0	1	...	0	0	1	1	1	0	1	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
97	0	0	0	0	0	0	1	1	0	0	...	0	0	0	0	0	0	0	0	0	0
98	0	1	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	1	1	0
99	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
100	0	0	0	0	0	0	1	0	1	0	...	0	0	0	0	0	1	0	0	0	0
101	0	0	0	0	0	0	1	1	0	0	...	0	0	0	0	0	0	0	0	0	0

	activity levels	activity levelsdifficulty	activity levelsphysical	activity levelsworsening	alcohol tobacco	appetite energy	body pains	changes appetite	chronic health	concentrating making	...	skin rashesworsening	sleeping nightmares	sleeping nightmaresincreased	sleeping nightmaresnone	sleeping nightmaresphysical	sleeping nightmaresworsening	stomach problems	tobacco drugs	use alcohol	worsening chronic
0	0	0	0	0	1	0	1	0	0	1	...	0	0	0	0	1	0	1	1	1	0
1	0	1	0	0	0	1	0	1	0	1	...	0	0	0	0	0	0	0	0	0	0
2	0	1	0	0	1	1	0	1	1	1	...	0	0	0	0	0	0	0	1	1	0
3	0	1	0	0	1	1	0	1	1	0	...	0	0	0	0	0	1	0	1	1	0
4	0	1	0	0	0	1	1	1	1	1	...	1	0	0	0	0	0	1	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
97	0	0	0	0	0	0	0	0	0	1	...	0	0	0	0	0	0	0	0	0	0
98	0	0	0	0	1	0	0	0	0	0	...	0	0	0	0	0	0	0	1	1	0
99	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
100	0	0	0	0	0	0	0	0	0	1	...	0	0	0	1	0	0	0	0	0	0
101	0	0	0	0	0	0	0	0	0	1	...	0	0	0	0	0	0	0	0	0	0

	0	1	2	3	4	5	6	7	8	9	...	92	93	94	95	96	97	98	99	100	101
0	1.000000	0.100253	0.321276	0.313231	0.403088	0.100253	0.360272	0.599873	0.658482	0.0	...	0.000000	0.376253	0.376253	0.0	0.100253	0.234855	0.420867	0.0	0.317268	0.234855
1	0.100253	1.000000	0.475736	0.383974	0.453282	1.000000	0.270199	0.131595	0.000000	0.0	...	0.550987	0.192349	0.192349	0.0	1.000000	0.514250	0.000000	0.0	0.162195	0.514250
2	0.321276	0.475736	1.000000	0.801958	0.391156	0.475736	0.291541	0.421717	0.000000	0.0	...	0.312990	0.109264	0.109264	0.0	0.475736	0.153032	0.426401	0.0	0.092135	0.153032
3	0.313231	0.383974	0.801958	1.000000	0.344995	0.383974	0.299184	0.411157	0.056271	0.0	...	0.321196	0.080533	0.080533	0.0	0.383974	0.000000	0.437581	0.0	0.067908	0.000000
4	0.403088	0.453282	0.391156	0.344995	1.000000	0.453282	0.817433	0.071225	0.467645	0.0	...	0.298218	0.104108	0.104108	0.0	0.453282	0.145809	0.000000	0.0	0.087787	0.145809
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
97	0.234855	0.514250	0.153032	0.000000	0.145809	0.514250	0.000000	0.308278	0.112356	0.0	...	0.000000	0.450600	0.450600	0.0	0.514250	1.000000	0.000000	0.0	0.379960	1.000000
98	0.420867	0.000000	0.426401	0.437581	0.000000	0.000000	0.000000	0.552443	0.000000	0.0	...	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	1.000000	0.0	0.000000	0.000000
99	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.0	...	0.000000	0.000000	0.000000	1.0	0.000000	0.000000	0.000000	1.0	0.000000	0.000000
100	0.317268	0.162195	0.092135	0.067908	0.087787	0.162195	0.000000	0.416456	0.155207	0.0	...	0.000000	0.608722	0.608722	0.0	0.162195	0.379960	0.000000	0.0	1.000000	0.379960
101	0.234855	0.514250	0.153032	0.000000	0.145809	0.514250	0.000000	0.308278	0.112356	0.0	...	0.000000	0.450600	0.450600	0.0	0.514250	1.000000	0.000000	0.0	0.379960	1.000000

The Impact of Covid 19 on Mental Health¶

Utilising Survey Data Collection, Data Visualisation, Univariate and Multivariate Analysis - Based on 3 Research Questions¶

Introduction and Objective¶

Role and Background¶

Sample and Population¶

Explicitly Stated Research Questions¶

Scope¶

Domain Concepts¶

Survey Design¶

Operational Definitions¶

Data¶

Justification of Good Data¶

Pre-Processing¶

Set Up Categories¶

Summary of Data Overview and Pre-Processing¶

Exploratory Data Analysis (EDA)¶

Frequency Distribution¶

Dichotomy Question. Q4 - Has the covid-19 lockdown affected your state of mind in any way? ¶

Q9 - How many hours per day are/ were you able to spend outside, on average during the lockdown? ¶

Checkbox Question. Q5 - Do you share a home with any of the following? ¶

Likert Scale Questions/ Key Variables¶

Q7. During the Covid 19 lockdown, please state the effects and restraints on your wellbeing:¶

7.1 How often did you feel unhappy or depressed?¶

Age Based¶

Gender Based¶

Q6. If you compare the pre-COVID-19 lockdown and the actual COVID-19 lockdown, how has:¶

6.1 Your amount of communication with family and friends changed?¶

Gender Based¶

Age Based¶

Q6. If you compare the pre-COVID-19 lockdown and the actual COVID-19 lockdown, how has:¶

6.2 Your financial status changed?¶

Gender Based¶

Q8. During the Covid 19 lockdown, please state the effects and restraints on your daily life:¶

8.1. How satisfied were you with your study and/ or work?¶

Gender Based¶

Age Based¶

Box Plots¶

Unhappy or Depressed and Age Variables¶

Univariate and Multivariate Analysis on Key Variables ¶

Checking Skewness and Kurtosis on Key Variables¶

Outliers¶

1. Computing Pearson Correlation Scores between features¶

Correlation Heatmap¶

Q12. Have you been tempted to access support from social groups during the covid-19 pandemic?¶

Q13 - Do you feel that the Covid 19 pandemic has affected your access to healthcare for other conditions?¶

Q11. During the Covid 19 lockdown, which of the following supported activities helped you feel better?¶

Q10. Select if you've had any of these psychological reactions towards the Covid 19 pandemic?¶

Conclusion and Discussion¶

Key Findings¶

Skewness¶

Outliers¶

Correlation Scores and Heatmap¶

Research Questions¶

Evaluation¶

References¶

Appendices¶

	didnt	exercise	exercisedoing	exerciseplaying	exercisewatching	family	familydoing	familyplaying	friends	help	...	playing	reading	remote	study	studyi	studyplaying	talking	tv	watching	work
0	0	0	0	0	1	0	1	0	1	0	...	0	1	1	1	0	0	0	0	0	1
1	0	0	0	0	1	0	0	0	0	0	...	0	0	1	1	0	0	0	1	0	1
2	0	0	0	0	1	0	0	0	0	0	...	0	0	1	1	0	0	0	1	0	1
3	0	0	0	0	1	0	0	0	0	0	...	0	1	1	1	0	0	0	0	0	1
4	0	0	0	0	1	0	0	0	0	0	...	0	0	0	0	0	0	0	1	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
97	0	0	0	0	0	0	1	0	1	0	...	0	1	1	1	0	0	0	0	1	1
98	0	0	0	0	1	0	0	0	0	0	...	0	1	0	0	0	0	0	0	0	0
99	0	0	0	0	0	0	1	0	1	0	...	0	0	1	1	0	0	0	1	1	1
100	0	0	0	0	0	0	0	0	0	0	...	0	1	1	0	0	1	0	1	1	1
101	0	0	0	0	0	0	1	0	1	0	...	0	0	1	0	0	1	1	0	0	1

	didnt need	exercisedoing housework	exercisedoing houseworkdoing	exercisedoing houseworkplaying	exercisedoing houseworktalking	exercisedoing remote	exerciseplaying music	exercisewatching reading	exercisewatching tv	familydoing remote	...	remote work	studyi didnt	studyplaying music	talking friends	tv ie	watching reading	watching tv	work study	work studyi	work studyplaying
0	0	0	0	0	0	0	0	1	0	1	...	1	0	0	0	0	0	0	1	0	0
1	0	0	0	0	0	0	0	0	1	0	...	1	0	0	0	1	0	0	1	0	0
2	0	0	0	0	0	0	0	0	1	0	...	1	0	0	0	1	0	0	1	0	0
3	0	0	0	0	0	0	0	1	0	0	...	1	0	0	0	0	0	0	1	0	0
4	0	0	0	0	0	0	0	0	1	0	...	0	0	0	0	1	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
97	0	0	0	0	0	0	0	0	0	1	...	1	0	0	0	0	1	0	1	0	0
98	0	0	0	0	0	0	0	1	0	0	...	0	0	0	0	0	0	0	0	0	0
99	0	0	0	0	0	0	0	0	0	1	...	1	0	0	0	1	0	1	1	0	0
100	0	0	0	0	0	0	0	0	0	0	...	1	0	1	0	1	0	1	0	0	1
101	0	0	0	0	0	0	0	0	0	1	...	1	0	1	1	0	0	0	0	0	1

	activity	alcohol	appetite	body	changes	chronic	concentrating	decisions	decisionsdifficulty	decisionsphysical	...	rashes	rashesincreased	rashesworsening	reactions	skin	sleeping	stomach	tobacco	use	worsening
0	0	1	0	1	0	0	1	0	1	0	...	0	1	0	1	1	1	1	1	1	0
1	1	0	1	0	1	0	1	1	0	0	...	0	0	0	0	0	0	0	0	0	0
2	1	1	1	0	1	1	1	0	0	0	...	0	0	0	0	0	0	0	1	1	0
3	1	1	1	0	1	1	0	0	0	0	...	0	0	0	0	0	1	0	1	1	0
4	1	0	1	1	1	1	1	0	0	1	...	0	0	1	1	1	0	1	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
97	0	0	0	0	0	0	1	1	0	0	...	0	0	0	0	0	0	0	0	0	0
98	0	1	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	1	1	0
99	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
100	0	0	0	0	0	0	1	0	1	0	...	0	0	0	0	0	1	0	0	0	0
101	0	0	0	0	0	0	1	1	0	0	...	0	0	0	0	0	0	0	0	0	0

	activity levels	activity levelsdifficulty	activity levelsphysical	activity levelsworsening	alcohol tobacco	appetite energy	body pains	changes appetite	chronic health	concentrating making	...	skin rashesworsening	sleeping nightmares	sleeping nightmaresincreased	sleeping nightmaresnone	sleeping nightmaresphysical	sleeping nightmaresworsening	stomach problems	tobacco drugs	use alcohol	worsening chronic
0	0	0	0	0	1	0	1	0	0	1	...	0	0	0	0	1	0	1	1	1	0
1	0	1	0	0	0	1	0	1	0	1	...	0	0	0	0	0	0	0	0	0	0
2	0	1	0	0	1	1	0	1	1	1	...	0	0	0	0	0	0	0	1	1	0
3	0	1	0	0	1	1	0	1	1	0	...	0	0	0	0	0	1	0	1	1	0
4	0	1	0	0	0	1	1	1	1	1	...	1	0	0	0	0	0	1	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
97	0	0	0	0	0	0	0	0	0	1	...	0	0	0	0	0	0	0	0	0	0
98	0	0	0	0	1	0	0	0	0	0	...	0	0	0	0	0	0	0	1	1	0
99	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
100	0	0	0	0	0	0	0	0	0	1	...	0	0	0	1	0	0	0	0	0	0
101	0	0	0	0	0	0	0	0	0	1	...	0	0	0	0	0	0	0	0	0	0