Predicting Sentiment Based on Drug Product User Reviews (Using Real World Data) for Informed Decision Making - Part 1¶

Sentiment Analysis Using a Supervised Binary Text Classifier¶


1. Introduction¶

A very interesting business application of text classification is sentiment analysis: a method for automatically gauging customers' perception of a product or service from their comments. The input text is classified as positive, negative, or, in some situations, neutral. Companies use it extensively to track user behaviour on social media, and it can strongly influence a company's marketing strategy, improving the customer experience and defining the advertising roadmap[3].

The evaluation of drug aspects (e.g. side effects, dosage, efficacy) has traditionally relied on randomized controlled trials with rigorous inclusion and exclusion criteria. However, such trials enrol only a small number of individuals, all of whom must meet possibly restrictive eligibility criteria, limiting population representativeness and, in turn, the generalizability of the study.

The ramifications of these limitations can include overestimating a product's efficacy and misidentifying adverse events/side effects in the wider, more diverse population, not to mention the heavy costs and time involved. To counter such issues, approaches such as post-marketing drug surveillance have been introduced to monitor drug safety after regulatory approval and mass production, e.g. by government regulators such as the FDA (US) or MHRA (UK), or by public/private organisations that track drug side effects. Existing methods for identifying adverse events have typically focused on analyzing molecular drug composition, query logs, VAERS (Vaccine Adverse Event Reporting System) records, or clinical notes in medical records. However, sentiment in consumer reviews was rarely taken into account.

Nevertheless, publicly available information on the Internet offers an easily attainable resource for gaining a deep understanding of drug reviews by users. Entire user reviews are freely available on drug review websites, where users comment on their personal experiences with the drugs they have taken for a specific condition.

Unlike many other forms of medical data, this information is not filtered through medical professionals, and since the reviews are posted by anonymous users, there is no risk of breaching the confidentiality of patient health records. The reviews contain a plethora of information about individual experiences with the drugs, such as symptoms, adverse events, and interactions with other drugs. They also contain an extensive amount of user sentiment related to a particular condition, which can be leveraged to detect the side effects and efficacy of drugs.[1]

Objectives¶

The insights gained from analysing public drug user reviews can inform strategy for better performance. Automatic analysis of patient posts on forums and social media has received attention in recent years as a direct source for understanding patients, enhancing the quality of care, and increasing patient satisfaction. Previously, as stated above, we had to rely on governing bodies and trials for feedback on drugs[2].

The proposed sentiment analysis approach will be useful not only for patients, but also for drug makers and clinicians seeking valuable summaries of public opinion. Since sentiment analysis is domain specific, domain knowledge about drug reviews can be incorporated into the sentiment analysis algorithm to provide more accurate analysis. For example, MetaMap can be used to map various health and medical terms (such as disease and drug names) to semantic types in the Unified Medical Language System (UMLS) Semantic Network.[3]

Sentiment analysis is the process of automatically measuring the type of opinion, i.e. positive, negative, or neutral, expressed in text.

Thus, our objective is to:

Analyse sentiment in this drug user review dataset using a supervised binary text classifier, which will classify user reviews as positive or negative. Ultimately this can help us predict sentiment concerning overall satisfaction with these drugs, which in turn can provide valuable insights and support decision making.

The results (metrics obtained from our classification approach) can then determine which text classifier/model works well, or more specifically, proves the most accurate on our chosen dataset.

Dataset¶

The dataset was retrieved from Kaggle [5], a platform that allows users to share datasets and explore/build upon them (license or rules permitting). The Drug Review Dataset originates from the UCI Machine Learning Repository. It provides over 200,000 patient reviews of specific drugs, along with the related conditions and a 10-star patient rating reflecting overall patient satisfaction. The underlying data were scraped from online pharmaceutical review sites.

The license is listed as 'other'.

The data is split into a train (75%) and a test (25%) partition (the train set consists of 161,297 samples, the test set of 53,766 drug reviews) and stored in two .csv (comma-separated values) files, respectively.

  • drugName (categorical): name of drug
  • condition (categorical): name of condition
  • review (text): patient review
  • rating (numerical): 10 star patient rating
  • date (date): date of review entry
  • usefulCount (numerical): number of users who found review useful

The structure of the data is as follows: a patient with a unique ID purchases a drug for their condition and writes a review and rating for that drug on a given date. If other readers find the review helpful, they click a button that increments usefulCount by 1.

The training file is 83MB and the associated test set is 27.6MB, giving a total size of 110.6MB. Across the two files there are 14 columns, 6 of which are string type, 4 integer, 2 DateTime, and 2 'other'.

The dataset was originally published on the UCI Machine Learning Repository: Felix Gräßer, Surya Kallumadi, Hagen Malberg, and Sebastian Zaunseder. 2018. Aspect-Based Sentiment Analysis of Drug Reviews Applying Cross-Domain and Cross-Data Learning. In Proceedings of the 2018 International Conference on Digital Health (DH '18). ACM, New York, NY, USA, 121-125.

The file format is .csv, which is a delimited text file, that uses commas to separate values.

To summarize, this dataset has a good usability score of 8.8: it is easy to interpret and includes all the relevant metadata. Hence, after some research, it was evident that this is a good-quality dataset for text classification in the healthcare arena, and more specifically for sentiment analysis of drug product reviews.

Techniques, insights, findings, rationale and caveats behind the code are presented with Python comments, docstrings and individual summaries below:

Evaluation Methodology¶

Given that the review rating is in the range 1-10, two classes were used (positive and negative), i.e. this is a supervised binary classification problem. We consider a review positive if the rating is higher than 5, and negative otherwise (a somewhat arbitrary but reasonable threshold).

Our metrics will be 1) overall accuracy, 2) F1 per class, and 3) macro-averaged F1. We use accuracy (compared against a most-frequent-class baseline) and an F-score for each class, which covers the most probable failure modes [11].

How do we evaluate the classifier's performance? One way is to measure accuracy: the percentage of correctly classified examples out of the total number of samples. Beyond accuracy, the F-score-based metrics are as follows:

F1 per class (F1-score for a binary classifier). We would like to summarize a model's performance in a single metric; that is where the F1-score is used. It combines precision and recall into a single number. The F1-score is computed using a mean ("average"), but not the usual arithmetic mean: it uses the harmonic mean, given by this simple formula:

$$F1\text{-}score = 2 \times \frac{precision \times recall}{precision + recall}$$

Like the arithmetic mean, the F1-score always lies somewhere between precision and recall. But it behaves differently: the F1-score gives a larger weight to the lower number. For example, when precision is 100% and recall is 0%, the F1-score is 0%, not 50%. Or suppose Classifier A has precision = recall = 80%, while Classifier B has precision = 60% and recall = 100%. Arithmetically, the mean of precision and recall is the same for both models, but under F1's harmonic mean the score for Classifier A is 80% while Classifier B scores only 75%: Model B's low precision pulled down its F1-score.
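A quick sanity check of these two examples in plain Python (standard library only; nothing here beyond the formula above):

In [ ]:
# Verifying the harmonic-mean examples above.
def f1(precision, recall):
    """F1 is the harmonic mean of precision and recall (defined as 0 when both are 0)."""
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)

print(f1(0.80, 0.80))  # Classifier A: 0.8
print(f1(0.60, 1.00))  # Classifier B: 0.75, below the arithmetic mean of 0.8
print(f1(1.00, 0.00))  # precision 100%, recall 0% -> F1 is 0.0, not 0.5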

As the F1-score gives equal weighting to precision and recall, it is well suited to our binary classifier.

F1 macro average

The next step is combining the per-class F1-scores into a single number, the classifier's overall F1-score. There are a few ways of doing this; let's begin with the simplest: the arithmetic mean of the per-class F1-scores, called the macro-averaged F1-score, or macro-F1 for short.

Macro F1-score will give the same importance to each label/class. It will be low for models that only perform well on the common classes while performing poorly on the rare classes.

The Macro F1-score is defined as the mean of class-wise/label-wise F1-scores:

$$Macro\ F1\text{-}score = \frac{1}{N} \sum \limits _{i=1} ^{N} F1\text{-}score_{i}$$

The metrics are provided by the software package sklearn (scikit-learn). This is the most popular machine learning package, and its sklearn.metrics.f1_score function computes macro-F1 via the average='macro' argument.

Finally, the F1-score is possibly the most common metric used for imbalanced classification problems, such as our dataset [15].
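As a quick illustration of these sklearn calls on a handful of toy labels (invented for this example; the real evaluation appears in the Implementation section below):

In [ ]:
# Toy labels, purely to illustrate the three metrics we report later.
from sklearn.metrics import accuracy_score, f1_score

y_true = [1, 1, 1, 0, 0, 1, 0, 1]
y_pred = [1, 1, 0, 0, 1, 1, 0, 1]

print(accuracy_score(y_true, y_pred))             # overall accuracy: 0.75
print(f1_score(y_true, y_pred, average=None))     # per-class F1: [0.667, 0.8]
print(f1_score(y_true, y_pred, average='macro'))  # macro F1: ~0.733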

Implementation¶

Preprocessing¶

In [2]:
# importing libraries
import pandas as pd, numpy as np
import statistics

# to ignore any warning messages.
import warnings
warnings.filterwarnings('ignore')

# for visualization
import matplotlib
import matplotlib.pyplot as plt, seaborn as sb
import plotly.express as px
import wordcloud

# for unit testing
from unittest.mock import patch, Mock
import unittest

#for machine learning
import scipy
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


#for text preprocessing
import re
import nltk
import string
from nltk.corpus import stopwords
nltk.download('stopwords')
#Stop words present in the nltk library
stopwords = nltk.corpus.stopwords.words('english')
stopwords[0:10]
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

from nltk.tokenize import word_tokenize
#importing the stemming and lemmatization functions from the nltk library
from nltk.stem import PorterStemmer, WordNetLemmatizer
Stemmer = PorterStemmer()


pd.options.display.max_colwidth = 200
%matplotlib inline
import os

#for language detection
#import langdetect

#for sentiment. only used when no training data
#from textblob import TextBlob

#for progress visualization
from tqdm import tqdm
tqdm.pandas()

#for vectorizer
from sklearn import feature_extraction, manifold

#for word embedding
import gensim.downloader as gensim_api

#for topic modeling
import gensim

#dealing with date feature
from datetime import datetime

# for error handling
from traceback import format_exc
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/stephenmalcolm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
In [3]:
#first, check we're using the right file encoding, i.e. it should be UTF-8
#this ensures it's relatively clean and can be read and organised (as opposed to some other unusable formats)
train = open('drugsComTrain_raw.csv', 'r')
test = open('drugsComTest_raw.csv', 'r')
train
Out[3]:
<_io.TextIOWrapper name='drugsComTrain_raw.csv' mode='r' encoding='UTF-8'>
In [4]:
#we have the correct file encoder, as stated in the above output
#now, import the datasets and read the training set

train = pd.read_csv('drugsComTrain_raw.csv') #no sentiment lexicons required (manually curated wordlists),
#as we have plenty of training data here - scraped from drug user review websites
test=pd.read_csv('drugsComTest_raw.csv')

#inspect the first and last five rows (DataFrame.append is deprecated in newer pandas, so we use pd.concat)
pd.concat([train.head(), train.tail()])
Out[4]:
uniqueID drugName condition review rating date usefulCount
0 206461 Valsartan Left Ventricular Dysfunction "It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil" 9 20-May-12 27
1 95260 Guanfacine ADHD "My son is halfway through his fourth week of Intuniv. We became concerned when he began this last week, when he started taking the highest dose he will be on. For two days, he could hardly get ou... 8 27-Apr-10 192
2 92703 Lybrel Birth Control "I used to take another oral contraceptive, which had 21 pill cycle, and was very happy- very light periods, max 5 days, no other side effects. But it contained hormone gestodene, which is not ava... 5 14-Dec-09 17
3 138000 Ortho Evra Birth Control "This is my first time using any form of birth control. I&#039;m glad I went with the patch, I have been on it for 8 months. At first It decreased my libido but that subsided. The only downside is... 8 3-Nov-15 10
4 35696 Buprenorphine / naloxone Opiate Dependence "Suboxone has completely turned my life around. I feel healthier, I&#039;m excelling at my job and I always have money in my pocket and my savings account. I had none of those before Suboxone an... 9 27-Nov-16 37
161292 191035 Campral Alcohol Dependence "I wrote my first report in Mid-October of 2014. I have not had any alcohol at all since that post, have reduced my dosage to 1 pill first thing in the morning. I am a very social person, and en... 10 31-May-15 125
161293 127085 Metoclopramide Nausea/Vomiting "I was given this in IV before surgey. I immediately became anxious and could not sit still . The PA said &quot;oh yes this happens sometimes&quot;. They had to give me Benadryl to calm me down. I... 1 1-Nov-11 34
161294 187382 Orencia Rheumatoid Arthritis "Limited improvement after 4 months, developed bad rash and MD refused to continue medication." 2 15-Mar-14 35
161295 47128 Thyroid desiccated Underactive Thyroid "I&#039;ve been on thyroid medication 49 years, I spent my first 38 on Synthroid and various t4 formulas, I could not stand the up and down of Synthroid, it never was steady. Every time I went to ... 10 19-Sep-15 79
161296 215220 Lubiprostone Constipation, Chronic "I&#039;ve had chronic constipation all my adult life. Tried Linzess, it worked for a month then stopped. Doctor started me on Amitiza (24 mg) and it is a miracle. I&#039;ve been on for four month... 9 13-Dec-14 116
In [5]:
#as both datasets contain the same columns, we can then combine them for efficient preprocessing and 
#better analysis

data = pd.concat([train, test])
data.head()
Out[5]:
uniqueID drugName condition review rating date usefulCount
0 206461 Valsartan Left Ventricular Dysfunction "It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil" 9 20-May-12 27
1 95260 Guanfacine ADHD "My son is halfway through his fourth week of Intuniv. We became concerned when he began this last week, when he started taking the highest dose he will be on. For two days, he could hardly get ou... 8 27-Apr-10 192
2 92703 Lybrel Birth Control "I used to take another oral contraceptive, which had 21 pill cycle, and was very happy- very light periods, max 5 days, no other side effects. But it contained hormone gestodene, which is not ava... 5 14-Dec-09 17
3 138000 Ortho Evra Birth Control "This is my first time using any form of birth control. I&#039;m glad I went with the patch, I have been on it for 8 months. At first It decreased my libido but that subsided. The only downside is... 8 3-Nov-15 10
4 35696 Buprenorphine / naloxone Opiate Dependence "Suboxone has completely turned my life around. I feel healthier, I&#039;m excelling at my job and I always have money in my pocket and my savings account. I had none of those before Suboxone an... 9 27-Nov-16 37
In [6]:
#Let's check the number of rows and columns
data.shape
Out[6]:
(215063, 7)
In [7]:
#Let's check for missing values (Nan) 
#From above output, we have 7 columns
data.isnull().sum()
Out[7]:
uniqueID          0
drugName          0
condition      1194
review            0
rating            0
date              0
usefulCount       0
dtype: int64
In [8]:
#let's create a helper function to display null values and data types [7]
def finding_null_value(data):
    total_null = data.isnull().sum()
    total_percent = (data.isnull().sum()/data.isnull().count()*100)
    new_var = pd.concat([total_null, total_percent], axis=1, keys=['Total_null', 'Total_percent(%)'])

    types_array = []
    for column in data.columns:
        types_array.append(str(data[column].dtype))
    new_var['Types'] = types_array

    return np.transpose(new_var)
In [9]:
#The data type function displays:
# 1. Total null values 
# 2. Total percentage 
# 3. Display types of every feature (don't need to use Dtypes command, however, we'll still demonstrate below) 
finding_null_value(data)
Out[9]:
uniqueID drugName condition review rating date usefulCount
Total_null 0 0 1194 0 0 0 0
Total_percent(%) 0 0 0.555186 0 0 0 0
Types int64 object object object int64 object int64
In [10]:
# dropping rows with null conditions
#'condition' is a critical feature and a string, hence we cannot impute it with a mean or most-frequent value
# we must therefore drop the rows where 'condition' is null/NaN
data.dropna(subset=["condition"], axis=0, inplace=True)


# reset the index, because we dropped 1194 rows
# resetting the index avoids errors when accessing rows by their indexes
data.reset_index(drop=True, inplace=True)
data
Out[10]:
uniqueID drugName condition review rating date usefulCount
0 206461 Valsartan Left Ventricular Dysfunction "It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil" 9 20-May-12 27
1 95260 Guanfacine ADHD "My son is halfway through his fourth week of Intuniv. We became concerned when he began this last week, when he started taking the highest dose he will be on. For two days, he could hardly get ou... 8 27-Apr-10 192
2 92703 Lybrel Birth Control "I used to take another oral contraceptive, which had 21 pill cycle, and was very happy- very light periods, max 5 days, no other side effects. But it contained hormone gestodene, which is not ava... 5 14-Dec-09 17
3 138000 Ortho Evra Birth Control "This is my first time using any form of birth control. I&#039;m glad I went with the patch, I have been on it for 8 months. At first It decreased my libido but that subsided. The only downside is... 8 3-Nov-15 10
4 35696 Buprenorphine / naloxone Opiate Dependence "Suboxone has completely turned my life around. I feel healthier, I&#039;m excelling at my job and I always have money in my pocket and my savings account. I had none of those before Suboxone an... 9 27-Nov-16 37
... ... ... ... ... ... ... ...
213864 159999 Tamoxifen Breast Cancer, Prevention "I have taken Tamoxifen for 5 years. Side effects are severe sweating and depression. I have been taking Effexor XR longer than I have been on Tamoxifen. My Oncologist increased the Effexor dosage... 10 13-Sep-14 43
213865 140714 Escitalopram Anxiety "I&#039;ve been taking Lexapro (escitaploprgram) since February. First, I&#039;d like to mention that you can NOT take this drug for a week or less and expect to magically feel better; I felt real... 9 8-Oct-16 11
213866 130945 Levonorgestrel Birth Control "I&#039;m married, 34 years old and I have no kids. Taking the pill was such a hassle so I decided to get the Mirena. It was very painful when it was inserted,then had cramping for the rest of tha... 8 15-Nov-10 7
213867 47656 Tapentadol Pain "I was prescribed Nucynta for severe neck/shoulder pain. After taking only 2, 75mg pills I was rushed to the ER with severe breathing problems. I have never had any issues with pain medicines befo... 1 28-Nov-11 20
213868 113712 Arthrotec Sciatica "It works!!!" 9 13-Sep-09 46

213869 rows × 7 columns

In [11]:
#let's check data types

#This gives us an extra check that there are no mistakes or unexpected values in our data.

#This extra step is also very useful for plotting: if we plot an ordinal variable, Pandas and
#Matplotlib will obey the natural ordering of its values, whereas otherwise the visualization would
#tend to be sorted alphabetically, which can be very confusing. This early stage of pre-processing
#definitely pays off when it comes to visualization[7].

data.dtypes
Out[11]:
uniqueID        int64
drugName       object
condition      object
review         object
rating          int64
date           object
usefulCount     int64
dtype: object
In [12]:
#let's first modify drugName, condition and review to category
#Set the nominal (non-ordered categorical) data types from object to category type

data['drugName']=data['drugName'].astype('category')
data['condition']=data['condition'].astype('category')
data['review']=data['review'].astype('category')
data.dtypes
Out[12]:
uniqueID          int64
drugName       category
condition      category
review         category
rating            int64
date             object
usefulCount       int64
dtype: object
In [13]:
#then modify date object to string
data['date']=data['date'].astype('string')
In [14]:
#We can see below that the columns which were previously generic 'object' dtype have now been converted to the appropriate types

data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 213869 entries, 0 to 213868
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype   
---  ------       --------------   -----   
 0   uniqueID     213869 non-null  int64   
 1   drugName     213869 non-null  category
 2   condition    213869 non-null  category
 3   review       213869 non-null  category
 4   rating       213869 non-null  int64   
 5   date         213869 non-null  string  
 6   usefulCount  213869 non-null  int64   
dtypes: category(3), int64(3), string(1)
memory usage: 14.4 MB
In [15]:
#quick and dirty summary statistics

data.describe(include='all')
Out[15]:
uniqueID drugName condition review rating date usefulCount
count 213869.000000 213869 213869 213869 213869.000000 213869 213869.000000
unique NaN 3667 916 128449 NaN 3579 NaN
top NaN Levonorgestrel Birth Control "Good" NaN 1-Mar-16 NaN
freq NaN 4896 38436 39 NaN 185 NaN
mean 116076.924786 NaN NaN NaN 6.991149 NaN 28.094118
std 67016.705794 NaN NaN NaN 3.275792 NaN 36.401377
min 0.000000 NaN NaN NaN 1.000000 NaN 0.000000
25% 58122.000000 NaN NaN NaN 5.000000 NaN 6.000000
50% 115972.000000 NaN NaN NaN 8.000000 NaN 16.000000
75% 174018.000000 NaN NaN NaN 10.000000 NaN 36.000000
max 232291.000000 NaN NaN NaN 10.000000 NaN 1291.000000
  • The top drugName is Levonorgestrel, hence a very popular choice. Delving deeper, Levonorgestrel is used to treat many conditions.
  • The top condition is Birth Control.
  • The top review is just the single word "Good", but it has a very small count: only 39.
  • The most reviews on a single day came on 1-Mar-16. This date could be investigated further, to see which drug names and conditions those reviews were for.
In [16]:
#perform deeper EDA
#preparing a separate DataFrame for analysis (using .copy(), so we don't mutate the original)
#we keep all columns, including usefulCount
data_explorer = data.copy()

# sorting rows in descending order of rating
data_explorer = data_explorer.sort_values('rating', ascending = False)

data_explorer.head()
Out[16]:
uniqueID drugName condition review rating date usefulCount
106934 132527 Doxepin Irritable Bowel Syndrome "Had symptoms of IBS D for years, after a bout with a bad colon infection. Following many tests, medication trials that all failed, a great GI specialist suggested doxepin, it was a miracle. Try ... 10 23-May-16 12
55269 151385 Chantix Smoking Cessation "I&#039;m 62 years old and had been smoking since I was 12. For financial and health reasons decided it was time to quit. With the help of Chanix, I was able to quit after 3 weeks on the medicatio... 10 18-Dec-16 23
55267 24899 Deplin 41</span> users found this comment helpful. "I never responded to any medication for anxiety and depression, which was made worse by most medications. Deplin was a miracle for me. I started to use it this summer and my response was immediat... 10 7-Dec-10 41
136264 52152 Adipex-P Obesity "The first week I took Adipex I lost 10 pounds, it helped that I ran every morning but I was always edgy. I would work work work and had no desire to eat. Best diet pill so far. I&#039;ve read tha... 10 2-Jun-11 44
136268 126034 Viibryd Depression "Giving it a 10 after only 3 days might sound odd but comparing to other meds I have been on, it&#039;s a 10. I was on Lexapro 20mg and Wellbutrin for a year and a half. Stopped the meds cold tu... 10 3-Oct-13 54
In [17]:
#ensure the number of unique IDs matches the number of rows in the above dataframe (after dropping missing values)
#this tells us we don't have duplicate patients (unique IDs)
#213,869 is the correct output

#check uniqueID
data_explorer['uniqueID'].nunique()
Out[17]:
213869
In [18]:
#plot a bar graph to check the top 10 drug names

#from the bar graph below, we can see that:
#Levonorgestrel - around 4,800 patients are taking this medication, the most popular drug here
#over 3,000 patients are taking each of the top 3 drugs

#the top 3 drug names have counts of around 4,000 and above
#most of the top-10 drug name counts are around 1,500

plt.figure(figsize=(12,6))
drug_top = data_explorer['drugName'].value_counts(ascending = False).head(10)

plt.bar(drug_top.index,drug_top.values,color='blue')
plt.title('Drug Names Top 10',fontsize = 20)
plt.xticks(rotation=90)
plt.ylabel('count')
plt.show()
In [19]:
#plot a bar graph to check the top 10 conditions

#from the bar graph below, we can see that:
#Birth Control - between 35,000 and 40,000 patients have birth control conditions, the most popular condition here
#approximately 3-7 times more popular than any of the other conditions
#most of the remaining top-10 conditions have between 5,000 and 10,000 reviews

plt.figure(figsize=(12,6))
cond_top = data_explorer['condition'].value_counts(ascending = False).head(10)

plt.bar(cond_top.index,cond_top.values,color='red')
plt.title('Conditions Top 10',fontsize = 20)
plt.xticks(rotation=90)
plt.ylabel('count')
plt.show()
In [20]:
#check counts for ratings
#from the output below, rating '10' (the top rating) accounts for the largest share of the counts

ratings_ = data_explorer['rating'].value_counts().sort_values(ascending=False).reset_index().\
                    rename(columns = {'index' :'rating', 'rating' : 'counts'})
ratings_['percent'] = 100 * (ratings_['counts']/data_explorer.shape[0])
print(ratings_)
   rating  counts    percent
0      10   67682  31.646475
1       9   36499  17.066054
2       1   28769  13.451692
3       8   24909  11.646849
4       7   12470   5.830672
5       5   10650   4.979684
6       2    9203   4.303101
7       3    8662   4.050143
8       6    8403   3.929041
9       4    6622   3.096288
In [21]:
#as a percentage, the top rating (10) accounts for just over 30%, or approximately a third of the counts

sb.set(font_scale = 1.2, style = 'darkgrid')
plt.rcParams['figure.figsize'] = [12, 6]

#let's plot and check
sb.barplot(x = ratings_['rating'], y = ratings_['percent'],order = ratings_['rating'], palette='winter')
plt.title('Ratings Percent',fontsize=20)
plt.show()
In [22]:
#let's check the number of drugs per condition

#we can see from the output below that there are 219 drugs linked to treating pain
#however, the largest number, 253 drugs, is linked to the 'not listed/other' condition

#it may be that some users didn't mention their condition, for privacy reasons. We could look up the
#drug names and fill in the conditions for which each drug is used.

#there is also possibly noise in our dataset, e.g. values wrongly fed in during web scraping.

data_explorer.groupby('condition')['drugName'].nunique().sort_values(ascending=False).head(10)
Out[22]:
condition
Not Listed / Othe       253
Pain                    219
Birth Control           181
High Blood Pressure     146
Acne                    127
Depression              115
Rheumatoid Arthritis    107
Diabetes, Type 2         97
Allergic Rhinitis        95
Insomnia                 85
Name: drugName, dtype: int64
In [23]:
#the output below shows a countplot and a distribution plot of the ratings, so we can see how the
#ratings are distributed throughout the data

#the 'Distribution of Ratings' plot below shows that the ratings are left-skewed (the left tail, i.e.
#smaller values, is much longer than the right tail), which suggests that most drugs received a
#relatively high score. How accurate is this? In an ideal world we would prefer a normal distribution.
#the skew may reflect a lack of medical knowledge about what a low score indicates versus what is
#meant by a high score. Thus, the 'review' column is perhaps a better indicator of sentiment...

plt.rcParams['figure.figsize'] = [20,8]
sb.set(font_scale = 1.4, style = 'whitegrid')
fig, ax = plt.subplots(1, 2)

sns_1 = sb.countplot(data['rating'], palette = 'spring', order = list(range(10, 0, -1)), ax = ax[0])
sns_2 = sb.distplot(data['rating'], ax = ax[1])  #note: distplot is deprecated in newer seaborn (use histplot/displot)
sns_1.set_title('Count of Ratings')
sns_1.set_xlabel("Rating")

sns_2.set_title('Distribution of Ratings')
sns_2.set_xlabel("Rating");
In [24]:
#the barplot below shows the top 10 drugs with a 10/10 rating, which shows us which drugs have received
#predominantly positive ratings and reviews

sb.set(font_scale = 1.2, style = 'whitegrid')
plt.rcParams['figure.figsize'] = [15, 8]

rating = dict(data.loc[data.rating == 10, "drugName"].value_counts())
drugname = list(rating.keys())
drug_rating = list(rating.values())

sns_rating = sb.barplot(x = drugname[0:10], y = drug_rating[0:10], palette = 'winter')

sns_rating.set(title = 'Top 10 Drugs with 10/10 Rating - Most Positive Ratings', ylabel = 'Number of Ratings', xlabel = "Drug Names")

plt.setp(sns_rating.get_xticklabels(), rotation=90);
In [25]:
#the barplot below shows the top 10 drugs with a 1/10 rating, which shows us which drugs have received
#predominantly negative ratings and reviews

#analysing the top 10 drugs with both 1/10 and 10/10 ratings:
#3 drugs (Levonorgestrel, Etonogestrel and Ethinyl estradiol/Norethindrone) appear in both the bottom 10 and
#top 10 ratings. Hence, there are mixed reviews on these 3 particular drugs, which shows that reviews of a drug
#are not all 100% positive or negative

sb.set(font_scale = 1.2, style = 'whitegrid')
plt.rcParams['figure.figsize'] = [15, 8]

rating = dict(data.loc[data.rating == 1, "drugName"].value_counts())
drugname = list(rating.keys())
drug_rating = list(rating.values())

sns_rating = sb.barplot(x = drugname[0:10], y = drug_rating[0:10], palette = 'winter')

sns_rating.set(title = 'Top 10 Drugs with 1/10 Rating - Most Negative Ratings', ylabel = 'Number of Ratings', xlabel = "Drug Names")

plt.setp(sns_rating.get_xticklabels(), rotation=90);

Preprocessing Steps [7]:

This is Step 1 in our feature extraction process

The kind of data we receive from customer feedback or user reviews is usually unstructured. It contains noise: unusual text and symbols that need to be cleaned so that a machine learning model can process it. Data cleaning and pre-processing are as important as building any sophisticated machine learning model; the reliability of our model depends heavily on the quality of our data [9].

As stated above, the description is in a textual format that is not suitable for machine learning as-is (in other words, we need to extract numeric features from it so that we can train and test classifiers). Put another way, machine learning algorithms learn from a pre-defined set of features in the training data in order to produce output for the test data.

As part of this task, we must transform the textual description (the review column) into a numeric feature matrix. Simple techniques for this include Term Frequency-Inverse Document Frequency (TF-IDF) and Count Vectors (CV), both available in Python through the scikit-learn package. Before TF-IDF can be applied, we need to clean up and prepare the textual data. Here is the list of steps we apply to the review column before applying TF-IDF:

  • Removing punctuation like . , ! $ ( ) * % @
  • Removing URLs
  • Removing stop words
  • Lower casing
  • Tokenization
  • Lemmatization
  • Applying n-gram tokenization
In [26]:
#WordNet increases the quality of generated features
#it includes synsets, collections of synonymous words

import nltk
nltk.download('wordnet')
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/stephenmalcolm/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Out[26]:
True
In [27]:
# function to preprocess a given text by applying several preprocessing steps

wordnet_lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # lower-case the text: one of the most common preprocessing steps, converting everything to the same case
    text = text.lower()
    # replace the repeating HTML-escaped apostrophe pattern &#039;
    text = text.replace("&#039;", "")
    # remove URLs *before* stripping punctuation; otherwise the '://' is lost and the pattern no longer matches cleanly
    cleaned_text = re.sub(r'http\S+', '', text)
    # remove all punctuation symbols from the text
    cleaned_text = "".join([i for i in cleaned_text if i not in string.punctuation])
    # split the text into separate tokens; each token is a word of the text
    tokens = cleaned_text.split(" ")
    # filter out tokens that are stopwords, as they are not really necessary for model building
    filtered_tokens = [i for i in tokens if i not in stopwords]
    # lemmatize the tokens: like stemming, but ensuring the word does not lose its meaning
    lemmatized_tokens = [wordnet_lemmatizer.lemmatize(word) for word in filtered_tokens]
    # replace any remaining non-alphanumeric symbols; we keep only alphanumeric characters
    sc_removed = [re.sub("[^a-zA-Z0-9]", ' ', token) for token in lemmatized_tokens]
    # return the preprocessed text
    return " ".join(sc_removed)
In [28]:
#apply preprocess_text to every review (with a tqdm progress bar) to produce the 'clean_review' feature below

data['clean_review'] = data['review'].progress_apply(lambda x: preprocess_text(x))
data.head()
 60%|██████    | 128449/213869 [02:08<01:25, 999.44it/s] 
Out[28]:
uniqueID drugName condition review rating date usefulCount clean_review
0 206461 Valsartan Left Ventricular Dysfunction "It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil" 9 20-May-12 27 side effect take combination bystolic 5 mg fish oil
1 95260 Guanfacine ADHD "My son is halfway through his fourth week of Intuniv. We became concerned when he began this last week, when he started taking the highest dose he will be on. For two days, he could hardly get ou... 8 27-Apr-10 192 son halfway fourth week intuniv became concerned began last week started taking highest dose two day could hardly get bed cranky slept nearly 8 hour drive home school vacation unusual called docto...
2 92703 Lybrel Birth Control "I used to take another oral contraceptive, which had 21 pill cycle, and was very happy- very light periods, max 5 days, no other side effects. But it contained hormone gestodene, which is not ava... 5 14-Dec-09 17 used take another oral contraceptive 21 pill cycle happy light period max 5 day side effect contained hormone gestodene available u switched lybrel ingredient similar pill ended started lybrel imm...
3 138000 Ortho Evra Birth Control "This is my first time using any form of birth control. I&#039;m glad I went with the patch, I have been on it for 8 months. At first It decreased my libido but that subsided. The only downside is... 8 3-Nov-15 10 first time using form birth control im glad went patch 8 month first decreased libido subsided downside made period longer 56 day exact used period 34 day max also made cramp intense first two day...
4 35696 Buprenorphine / naloxone Opiate Dependence "Suboxone has completely turned my life around. I feel healthier, I&#039;m excelling at my job and I always have money in my pocket and my savings account. I had none of those before Suboxone an... 9 27-Nov-16 37 suboxone completely turned life around feel healthier im excelling job always money pocket saving account none suboxone spent year abusing oxycontin paycheck already spent time got started reso...
In [29]:
#let's look at the 'rating' feature, to see whether the majority of customer ratings are positive or negative
#(optional plotly histogram below, left commented out: as the ratings table above showed, the most common rating is '10', the highest)

#color=sb.color_palette()
#%matplotlib inline
#import plotly.offline as py
#py.init_notebook_mode(connected=True)
#import plotly.graph_objs as go
#import plotly.tools as tls
#import plotly.express as px

# Product Scores

#fig=px.histogram(data,x="rating")
#fig.update_traces(marker_color="turquoise", marker_line_color = 'rgb(8,48,107)', marker_line_width=1.5)
#fig.update_layout(title_text='Product Score')
#fig.show()
In [30]:
#create sentiment feature from ratings [17]

#if rating > 5: sentiment = 1 (positive)
#if rating <= 5: sentiment = 0 (negative)

data['sentiment'] = data["rating"].apply(lambda x: 1 if x > 5 else 0)
data.head()
Out[30]:
uniqueID drugName condition review rating date usefulCount clean_review sentiment
0 206461 Valsartan Left Ventricular Dysfunction "It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil" 9 20-May-12 27 side effect take combination bystolic 5 mg fish oil 1
1 95260 Guanfacine ADHD "My son is halfway through his fourth week of Intuniv. We became concerned when he began this last week, when he started taking the highest dose he will be on. For two days, he could hardly get ou... 8 27-Apr-10 192 son halfway fourth week intuniv became concerned began last week started taking highest dose two day could hardly get bed cranky slept nearly 8 hour drive home school vacation unusual called docto... 1
2 92703 Lybrel Birth Control "I used to take another oral contraceptive, which had 21 pill cycle, and was very happy- very light periods, max 5 days, no other side effects. But it contained hormone gestodene, which is not ava... 5 14-Dec-09 17 used take another oral contraceptive 21 pill cycle happy light period max 5 day side effect contained hormone gestodene available u switched lybrel ingredient similar pill ended started lybrel imm... 0
3 138000 Ortho Evra Birth Control "This is my first time using any form of birth control. I&#039;m glad I went with the patch, I have been on it for 8 months. At first It decreased my libido but that subsided. The only downside is... 8 3-Nov-15 10 first time using form birth control im glad went patch 8 month first decreased libido subsided downside made period longer 56 day exact used period 34 day max also made cramp intense first two day... 1
4 35696 Buprenorphine / naloxone Opiate Dependence "Suboxone has completely turned my life around. I feel healthier, I&#039;m excelling at my job and I always have money in my pocket and my savings account. I had none of those before Suboxone an... 9 27-Nov-16 37 suboxone completely turned life around feel healthier im excelling job always money pocket saving account none suboxone spent year abusing oxycontin paycheck already spent time got started reso... 1
In [31]:
#we've now classified ratings into positive and negative, 1 and 0 respectively

positive=data[data['sentiment']==1]
negative=data[data['sentiment']==0]
In [32]:
#next we will use n-gram tokenization with n=2 to find the most frequently occurring n-grams in the
#review texts of people with both positive and negative reviews

#we explore n-grams, rather than single words, so that we can consider word collocations

def count_ngrams(dataframe, column, begin_ngram, end_ngram):
    word_vectorizer = CountVectorizer(ngram_range=(begin_ngram, end_ngram), analyzer='word')
    sparse_matrix = word_vectorizer.fit_transform(dataframe[column].dropna())
    # summing the sparse rows gives the corpus-wide frequency of each n-gram
    frequencies = sum(sparse_matrix).data
    most_common = pd.DataFrame(frequencies,
                               index=word_vectorizer.get_feature_names(),  # get_feature_names_out() in newer sklearn
                               columns=['frequency']).sort_values('frequency', ascending=False)
    most_common['ngram'] = most_common.index
    return most_common
In [33]:
#to limit memory consumption, we'll first randomly sample 20,000 negative reviews and perform n-gram tokenization
#please note that this is a memory-intensive task and might take a long time to run

#we could fine-tune the n-gram tokenizer (increase n) to optimize the accuracy of our models; however,
#this is computationally expensive, hence we use bigrams (as opposed to trigrams) in this instance.
#we do this because two words together can carry more meaning than the same words on their own,
#so these bigrams could improve the prediction of positive or negative sentiment over the single-word format

sample_df = negative.sample(20000)
two_grams = count_ngrams(sample_df,'clean_review', 2, 2)

fig = px.bar(two_grams.sort_values('frequency',ascending=False)[0:10].iloc[::-1], 
             x="frequency", 
             y="ngram",
             title='Most Common 2-gram words in negative reviews of people',
             orientation='h')
fig.show()
In [34]:
#we will do the same for positive reviews

#no real inferences could be made from the bigrams
#we would possibly need to fine-tune the n-gram tokenizer with larger n-grams and/or randomly sample more reviews
#to arrive at a conclusion; as this is computationally expensive, we'll stop here

sample_df = positive.sample(20000)
two_grams = count_ngrams(sample_df,'clean_review', 2, 2)

fig = px.bar(two_grams.sort_values('frequency',ascending=False)[0:10].iloc[::-1], 
             x="frequency", 
             y="ngram",
             title='Most Common 2-gram words in positive reviews of people',
             orientation='h')
fig.show()
In [35]:
# a pie chart to represent the distribution of sentiments across the reviews posted
#our dataset is imbalanced, because just under 30% of our reviews are negative
#this information will be very useful for the modelling part

size = [len(positive), len(negative)]
colors = ['lightblue', 'lightgreen']
labels = "Positive Sentiment","Negative Sentiment"
explode = [0, 0.1]

plt.rcParams['figure.figsize'] = (10, 10)
plt.pie(size, colors = colors, labels = labels, explode = explode, autopct = '%.2f%%')
plt.axis('off')
plt.title('Pie Chart Representation of Sentiments', fontsize = 25)
plt.legend()
plt.show()
In [36]:
# add number of characters column, for more exploration
data["nb_chars"] = data["clean_review"].apply(lambda x: len(x))

# add number of words column, for more exploration
data["nb_words"] = data["clean_review"].apply(lambda x: len(x.split(" ")))
data
Out[36]:
uniqueID drugName condition review rating date usefulCount clean_review sentiment nb_chars nb_words
0 206461 Valsartan Left Ventricular Dysfunction "It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil" 9 20-May-12 27 side effect take combination bystolic 5 mg fish oil 1 51 9
1 95260 Guanfacine ADHD "My son is halfway through his fourth week of Intuniv. We became concerned when he began this last week, when he started taking the highest dose he will be on. For two days, he could hardly get ou... 8 27-Apr-10 192 son halfway fourth week intuniv became concerned began last week started taking highest dose two day could hardly get bed cranky slept nearly 8 hour drive home school vacation unusual called docto... 1 427 69
2 92703 Lybrel Birth Control "I used to take another oral contraceptive, which had 21 pill cycle, and was very happy- very light periods, max 5 days, no other side effects. But it contained hormone gestodene, which is not ava... 5 14-Dec-09 17 used take another oral contraceptive 21 pill cycle happy light period max 5 day side effect contained hormone gestodene available u switched lybrel ingredient similar pill ended started lybrel imm... 0 460 74
3 138000 Ortho Evra Birth Control "This is my first time using any form of birth control. I&#039;m glad I went with the patch, I have been on it for 8 months. At first It decreased my libido but that subsided. The only downside is... 8 3-Nov-15 10 first time using form birth control im glad went patch 8 month first decreased libido subsided downside made period longer 56 day exact used period 34 day max also made cramp intense first two day... 1 247 43
4 35696 Buprenorphine / naloxone Opiate Dependence "Suboxone has completely turned my life around. I feel healthier, I&#039;m excelling at my job and I always have money in my pocket and my savings account. I had none of those before Suboxone an... 9 27-Nov-16 37 suboxone completely turned life around feel healthier im excelling job always money pocket saving account none suboxone spent year abusing oxycontin paycheck already spent time got started reso... 1 457 72
... ... ... ... ... ... ... ... ... ... ... ...
213864 159999 Tamoxifen Breast Cancer, Prevention "I have taken Tamoxifen for 5 years. Side effects are severe sweating and depression. I have been taking Effexor XR longer than I have been on Tamoxifen. My Oncologist increased the Effexor dosage... 10 13-Sep-14 43 taken tamoxifen 5 year side effect severe sweating depression taking effexor xr longer tamoxifen oncologist increased effexor dosage 75 mg 150 mg per day assure effexor black cohoosh would stop sw... 1 357 54
213865 140714 Escitalopram Anxiety "I&#039;ve been taking Lexapro (escitaploprgram) since February. First, I&#039;d like to mention that you can NOT take this drug for a week or less and expect to magically feel better; I felt real... 9 8-Oct-16 11 ive taking lexapro escitaploprgram since february first id like mention take drug week le expect magically feel better felt really sick first two week drug give drug time didnt really start notici... 1 490 73
213866 130945 Levonorgestrel Birth Control "I&#039;m married, 34 years old and I have no kids. Taking the pill was such a hassle so I decided to get the Mirena. It was very painful when it was inserted,then had cramping for the rest of tha... 8 15-Nov-10 7 im married 34 year old kid taking pill hassle decided get mirena painful insertedthen cramping rest day first 6 week spotted period stopped still got cramp every month never needed take anything 5... 1 451 73
213867 47656 Tapentadol Pain "I was prescribed Nucynta for severe neck/shoulder pain. After taking only 2, 75mg pills I was rushed to the ER with severe breathing problems. I have never had any issues with pain medicines befo... 1 28-Nov-11 20 prescribed nucynta severe neckshoulder pain taking 2 75mg pill rushed er severe breathing problem never issue pain medicine 0 123 18
213868 113712 Arthrotec Sciatica "It works!!!" 9 13-Sep-09 46 work 1 4 1

213869 rows × 11 columns

In [37]:
# creating the word cloud for negative reviews
from wordcloud import WordCloud

final_wordcloud = WordCloud(width = 800, height = 800, 
                background_color ='black', 
                stopwords = stopwords, 
                min_font_size = 10).generate(' '.join(data[data['sentiment']==0]['clean_review']))


# Displaying the WordCloud                    
plt.figure(figsize = (10, 13), facecolor = None) 
plt.title('Word Cloud for Negative Reviews')
plt.imshow(final_wordcloud) 
plt.axis("off")
plt.tight_layout(pad = 0) 
  
plt.show()
In [40]:
# Creating the Word Cloud for positive reviews
final_wordcloud = WordCloud(width = 800, height = 800, 
                background_color ='black', 
                stopwords = stopwords, 
                min_font_size = 10).generate(' '.join(data[data['sentiment']==1]['clean_review']))


# Displaying the WordCloud                    
plt.figure(figsize = (10, 13), facecolor = None)
plt.title('Word Cloud for Positive Reviews')
plt.imshow(final_wordcloud) 
plt.axis("off")
plt.tight_layout(pad = 0) 
  
plt.show()
In [41]:
# Splitting the data back into train and test sets. We will use the train set for training the models and
# the test set to evaluate them

X_train, X_test, y_train, y_test = train_test_split(data['clean_review'], data['sentiment'], test_size=0.33, 
                                                    random_state=42)

Baseline Performance¶

A baseline is a model that is simple to set up and has a reasonable chance of providing decent results. Experimenting with baselines is usually quick and low cost, since implementations are widely available in popular packages.

For our use case, we will use Bag of Words (BoW) and Naive Bayes from the sklearn package as a baseline. When classifying structured data or natural language, Naive Bayes usually provides quick, solid results.

Naive Bayes is one of the fastest and simplest classification algorithms; it is very suitable for high-dimensional datasets and performs well with categorical variables, such as those in this dataset. Another advantage of Naive Bayes is that it has very few tunable parameters. It is often very convenient to use Naive Bayes as a baseline classifier to get an initial benchmark against which to compare other models [16].

Additionally, we are going to use a BoW baseline, as opposed to word embeddings. Embeddings are a more powerful word representation: they are short, dense vectors. Unlike the traditional sparse, long vectors with dimensions corresponding to the words in the vocabulary or the documents in a collection, embeddings are short, with the number of dimensions d ranging from 50 to 1000, far smaller than the vocabulary size |V| or the number of documents D. These d dimensions do not have a clear interpretation, and the vectors are dense: instead of sparse, mostly-zero counts or functions of counts, the entries are real-valued numbers that can be negative[12].

However, we are considering BoW rather than Word Embeddings for the following reasons[13]:

  1. We are building a baseline model. Using scikit-learn, it takes just a few lines of code to build one.
  2. If the dataset is small and the context is domain specific, BoW may work better than word embeddings. Our dataset is 110.6MB, which is small in the grand scheme of big-data environments, and our context is focused on drug review text. That context is very domain specific, which means we often cannot find corresponding vectors in pre-trained word embedding models, e.g. GloVe, fastText etc.

How Does it Work?

BoW provides a set of vectors that can be used as features for training a machine learning model. Hence, for the baseline, we use BoW to generate features (feature extraction) and Naive Bayes as the model trained on those features. It is not possible to build a model using BoW alone, without some classification model on top.

Multinomial Naive Bayes is a very suitable approach for text classification, where we take a fragment of text, e.g. a blog post or a passage such as a review, and try to classify it according to some topical scheme. In this kind of problem the features are essentially the word types: all the different words that might appear in the vocabulary. The multinomial distribution then models the counts observed across these various word types.
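As a minimal, self-contained illustration of this idea (four made-up one-line "reviews", purely for intuition; the actual baseline below is trained on the drug-review data):

In [ ]:
# Toy example: word counts fed to Multinomial Naive Bayes.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

toy_docs = ["great drug worked well", "no side effects great",
            "terrible nausea and headache", "made my pain worse"]
toy_labels = [1, 1, 0, 0]  # 1 = positive, 0 = negative

vec = CountVectorizer()
X_toy = vec.fit_transform(toy_docs)
nb = MultinomialNB().fit(X_toy, toy_labels)

print(nb.predict(vec.transform(["worked great no headache"])))  # [1]
print(nb.predict_proba(vec.transform(["worse nausea"])))        # weighted towards class 0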

In [42]:
# Text classification for sentiment analysis can be achieved in many ways, using a number of algorithms
#[5]. We convert the text into a bag-of-words model (a sparse matrix of integers): we can't pass raw text
#features into our model, so we have to convert them into numeric values.
#This Bag of Words model will act as our baseline when we train a Naive Bayes classifier on it.

#CountVectorizer from sklearn package helps us build bag of words vectors so that we can use it for training and 
#evaluating later

bow = CountVectorizer(min_df=5, ngram_range=(1,1), max_features=5000)

try:
    train_features_bow = bow.fit_transform(X_train).toarray()
    test_features_bow = bow.transform(X_test).toarray()
except:
    raise Exception("Failed to vectorize data: ", format_exc())
    
train_features_bow.shape
Out[42]:
(143292, 5000)
In [43]:
from sklearn.naive_bayes import MultinomialNB

try:
    baseline_model = MultinomialNB()
    baseline_model.fit(train_features_bow, y_train)
except:
    raise Exception("Failed to train model: ", format_exc())
In [45]:
preds = baseline_model.predict(test_features_bow)

print("Accuracy Score: ", accuracy_score(y_pred=preds, y_true=y_test))
print("Classification Report:\n ", classification_report(y_pred=preds, y_true=y_test))
Accuracy Score:  0.7831588194454284
Classification Report:
                precision    recall  f1-score   support

           0       0.63      0.67      0.65     21166
           1       0.85      0.83      0.84     49411

    accuracy                           0.78     70577
   macro avg       0.74      0.75      0.75     70577
weighted avg       0.79      0.78      0.78     70577

Classification Approach¶

A new column called 'sentiment' was created, encoded from the 'rating' column. This is a very relevant feature for training our classifier and can have a huge impact on the learning method's ability to extract a good model.

The sentiment column signifies which sentiment a particular review expresses (0 -> negative, 1 -> positive). It acts as our target column, and we use ML models to learn to predict it from the review text. Before training, however, we used train_test_split to split the merged dataset back into train and test sets: we train our models on the train set and evaluate them on the test set, which acts as unseen data. The better a model's scores on the test set, the better the model.

Next we will try out different classification models, incorporating TF-IDF vectorization, and see whether we can improve on the baseline score.

Why TF-IDF Outperforms Bag of Words in Information Retrieval and Text Representations

TF-IDF stands for "Term Frequency - Inverse Document Frequency". It is a technique for quantifying the words in a set of documents, i.e. for modelling text data: we compute a score for each word to signify its importance within the document and the corpus. The method is widely used in Information Retrieval and Text Mining, and might give us a better model, with a higher accuracy score than the baseline.

Bag of Words just creates a set of vectors containing the counts of word occurrences in each document (review), while the TF-IDF model also encodes which words are more important and which are less so. Bag of Words vectors are easy to interpret, but TF-IDF usually performs better in machine learning models.

Hence, ultimately, this is why we implement TF-IDF instead of BoW for our classification approach[23].
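To make the difference concrete, here is a small toy illustration (three invented documents, not our reviews): a word that appears in every document keeps its raw count under BoW but is down-weighted by TF-IDF relative to rarer, more discriminative words:

In [ ]:
# Toy comparison of BoW counts vs TF-IDF weights.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

docs = ["drug helped pain", "drug caused nausea", "drug helped sleep"]

cv = CountVectorizer().fit(docs)
tv = TfidfVectorizer().fit(docs)

print(cv.get_feature_names_out())             # use get_feature_names() on older sklearn
print(cv.transform(docs).toarray())           # 'drug' counts 1 in every row
print(tv.transform(docs).toarray().round(2))  # 'drug' gets the lowest weight in each row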

In [46]:
#the TF-IDF vectorizer below extracts numeric features from the cleaned review text

tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', ngram_range=(1,1), max_features=2000)

try:
    train_features_tfidf = tfidf.fit_transform(X_train).toarray()
    test_features_tfidf = tfidf.transform(X_test).toarray()
except:
    raise Exception("Failed to vectorize data: ", format_exc())
    
train_features_tfidf.shape
Out[46]:
(143292, 2000)
In [47]:
# The model training steps below might take a while to run...

Logistic Regression Model¶

Logistic regression provides a discrete output and models the probability of belonging to one of two outcomes (positive or negative), making it well suited to our text classification exercise. The logistic regression classifier is a linear classifier that uses the calculated logits (scores) to predict the target class. It measures the relationship between the categorical dependent variable and one or more independent variables by estimating probabilities with the logistic function. The dependent variable is the target class we want to predict; the independent variables are the features or attributes we use to predict it. Hence, feature extraction is important.
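For intuition, here is the logistic (sigmoid) function that converts a logit into a probability, with a couple of worked values:

In [ ]:
# The logistic function maps any real-valued logit to a probability in (0, 1).
import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

print(sigmoid(0.0))   # 0.5   -> on the decision boundary
print(sigmoid(2.0))   # ~0.88 -> confidently positive
print(sigmoid(-2.0))  # ~0.12 -> confidently negative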

In [48]:
from sklearn.linear_model import LogisticRegression

try:
    lr = LogisticRegression()
    lr.fit(train_features_tfidf, y_train)
except Exception:
    raise Exception(f"Failed to train model: {format_exc()}")
In [49]:
preds = lr.predict(test_features_tfidf)

print("Accuracy Score: ", accuracy_score(y_pred=preds, y_true=y_test))
print("Classification Report:\n ", classification_report(y_pred=preds, y_true=y_test))
Accuracy Score:  0.8277625855448659
Classification Report:
                precision    recall  f1-score   support

           0       0.76      0.62      0.68     21166
           1       0.85      0.92      0.88     49411

    accuracy                           0.83     70577
   macro avg       0.80      0.77      0.78     70577
weighted avg       0.82      0.83      0.82     70577

Multinomial Naive Bayes Model¶

For background on this algorithm, refer to the 'Baseline' section above.

In [61]:
from sklearn.naive_bayes import MultinomialNB

try:
    mnb = MultinomialNB()
    mnb.fit(train_features_tfidf, y_train)
except Exception:
    raise Exception(f"Failed to train model: {format_exc()}")
In [62]:
preds = mnb.predict(test_features_tfidf)

print("Accuracy Score: ", accuracy_score(y_pred=preds, y_true=y_test))
print("Classification Report:\n ", classification_report(y_pred=preds, y_true=y_test))
Accuracy Score:  0.7713419385918926
Classification Report:
                precision    recall  f1-score   support

           0       0.83      0.30      0.44     21166
           1       0.76      0.97      0.86     49411

    accuracy                           0.77     70577
   macro avg       0.80      0.64      0.65     70577
weighted avg       0.78      0.77      0.73     70577

LightGBM Model¶

LightGBM is a gradient boosting framework that uses tree-based learning algorithms. It has become a default choice in machine learning competitions for tabular data (such as ours), for both regression and classification problems.

It is designed to be distributed and efficient, with the following advantages:

  • Faster training speed and higher accuracy.
  • Low memory utilization.
  • Comparatively better accuracy than other boosting algorithms, and better handling of overfitting on smaller datasets.
  • Parallel learning support.
  • Compatible with both small and large datasets.
  • Quicker to train than XGBoost.

Since it is based on decision tree algorithms, LightGBM grows the tree leaf-wise on the best fit, whereas most other boosting algorithms grow the tree depth-wise (level-wise). Expanding the leaf with the largest loss reduction means the leaf-wise algorithm can reduce more loss per split than a level-wise algorithm, which often results in better accuracy. It is also very fast, hence the word 'Light'.[18, 19]
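
For reference, the main knobs that control this leaf-wise growth are shown in the sketch below; the values are just LightGBM's defaults spelled out, not tuned settings, and the cell that follows simply uses these defaults implicitly.

In [ ]:
# Illustrative parameter sketch (defaults made explicit, not a tuned model)
from lightgbm import LGBMClassifier

lgb_sketch = LGBMClassifier(
    num_leaves=31,      # cap on leaves per tree: the key leaf-wise complexity control
    max_depth=-1,       # -1 = unlimited depth; level-wise learners constrain this instead
    learning_rate=0.1,  # shrinkage applied to each boosting step
    n_estimators=100)   # number of boosted trees
# lgb_sketch.fit(train_features_tfidf, y_train)  # would train exactly as in the cell below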

In [63]:
# gradient boosting model

from lightgbm import LGBMClassifier

try:
    lgb = LGBMClassifier()
    lgb.fit(train_features_tfidf, y_train)
except Exception:
    raise Exception(f"Failed to train model: {format_exc()}")
In [64]:
preds = lgb.predict(test_features_tfidf)

print("Accuracy Score: ", accuracy_score(y_pred=preds, y_true=y_test))
print("Classification Report:\n ", classification_report(y_pred=preds, y_true=y_test))
Accuracy Score:  0.8142596029868088
Classification Report:
                precision    recall  f1-score   support

           0       0.78      0.53      0.63     21166
           1       0.82      0.93      0.88     49411

    accuracy                           0.81     70577
   macro avg       0.80      0.73      0.75     70577
weighted avg       0.81      0.81      0.80     70577

Random Forest Model¶

Random forest builds multiple decision trees and merges them together to get a more accurate and stable prediction.

A useful property of the random forest algorithm is that it is very easy to measure the relative importance of each feature to the prediction. Sklearn (as seen below) provides a tool for this that measures a feature's importance by looking at how much the tree nodes that use that feature reduce impurity across all trees in the forest. It computes this score automatically for each feature after training and scales the results so that the importances sum to one. By looking at the feature importances, you can decide which features to drop because they contribute little (or sometimes nothing at all) to the prediction process. This matters because, as a general rule in machine learning, the more features you have, the more likely your model is to suffer from overfitting, and vice versa[20].

In [50]:
from sklearn.ensemble import RandomForestClassifier

try:
    RF = RandomForestClassifier(n_jobs=-1, max_depth=250)
    RF.fit(train_features_tfidf, y_train)
except Exception:
    raise Exception(f"Failed to train model: {format_exc()}")
In [52]:
preds = RF.predict(test_features_tfidf)
print("Accuracy Score: ", accuracy_score(y_pred=preds, y_true=y_test))
print("Classification Report:\n ", classification_report(y_pred=preds, y_true=y_test))
Accuracy Score:  0.9020502429970103
Classification Report:
                precision    recall  f1-score   support

           0       0.94      0.72      0.82     21166
           1       0.89      0.98      0.93     49411

    accuracy                           0.90     70577
   macro avg       0.91      0.85      0.87     70577
weighted avg       0.91      0.90      0.90     70577

Further Feature Extraction Techniques¶

Since the Random Forest model produced the best accuracy, further feature analysis was carried out on its 2000 TF-IDF features. The attribute RF.feature_importances_ (see the code below) returns the impurity-based feature importances.

The higher a word's score (see the 'words score' column in the output below), the more important the feature. The importance of a feature is computed as the (normalized) total reduction of the criterion brought by that feature. It is also known as the Gini importance [21, 22].

In [59]:
import pandas as pd

# pair each word in the TF-IDF vocabulary with its Gini importance and store in a dataframe
# (on scikit-learn < 1.0, use tfidf.get_feature_names() instead of get_feature_names_out())
important_features = pd.DataFrame(
    list(zip(RF.feature_importances_, tfidf.get_feature_names_out())),
    columns=['words score', 'words'])
# sort the features by importance, highest first
important_features.sort_values('words score', ascending=False)
Out[59]:
words score words
1968 0.011501 worse
1983 0.010977 year
1970 0.009372 worst
853 0.008666 horrible
779 0.007952 great
... ... ...
386 0.000040 clonidine
1135 0.000039 moisturizer
1577 0.000039 skeptical
124 0.000039 adipex
273 0.000038 blessing

2000 rows × 2 columns

Top features for the positive and negative classes¶

The first column (on the left below) shows the 20 most informative features for the negative class, and the second column (on the right) shows the 20 most informative features for the positive class.

These features were extracted using the Multinomial Naive Bayes model: we train it for sentiment analysis and then read off the per-class feature log-probabilities it has learned for the negative and positive classes.

In [54]:
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
mnb.fit(train_features_tfidf, y_train)

def show_most_informative_features(vectorizer, clf, n=20):
    # clf.feature_log_prob_[1] holds the log-probability of each feature given the
    # positive class (equivalent to clf.coef_[0] on scikit-learn < 1.0). Sorting it
    # ascending puts negative-leaning terms first and positive-leaning terms last.
    feature_names = vectorizer.get_feature_names_out()  # get_feature_names() on older sklearn
    coefs_with_fns = sorted(zip(clf.feature_log_prob_[1], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))

show_most_informative_features(tfidf, mnb)
	-9.7176	useless        		-4.9927	day            
	-9.7112	ruined         		-5.0549	year           
	-9.7067	calling        		-5.1572	effect         
	-9.6045	waste          		-5.1749	side           
	-9.5957	omg            		-5.2266	work           
	-9.5799	ridiculous     		-5.2395	month          
	-9.5652	cyclen         		-5.2842	take           
	-9.5457	zap            		-5.3103	week           
	-9.5353	horrendous     		-5.3171	taking         
	-9.5309	asap           		-5.3327	pain           
	-9.5304	confused       		-5.3570	time           
	-9.5283	yi             		-5.3601	im             
	-9.5040	permanent      		-5.3756	ive            
	-9.4772	blurry         		-5.4324	first          
	-9.4738	confusion      		-5.4491	get            
	-9.4706	fire           		-5.4865	feel           
	-9.4417	junel          		-5.5102	started        
	-9.4346	star           		-5.5229	like           
	-9.4073	ovary          		-5.5355	medication     
	-9.4008	wall           		-5.5376	medicine       

Conclusion¶

Evaluation¶

Out of all the methods and models we tried, the Random Forest classifier gave the best results, with an accuracy score of ~0.90. This is a good score and a large improvement on our baseline of ~0.78. One reason random forests perform so well on this problem is that they handle high-dimensional data well, since the algorithm works with subsets of the data. Random forests are bagged decision tree models that split on a random subset of features at each split, which decorrelates the trees: at every split, the model considers only a small subset of features rather than all of them. They can handle binary, categorical, and numerical features.

There is very little pre-processing that needs to be done: the data does not need to be rescaled or transformed, the model handles outliers well by essentially binning them, and it is indifferent to non-linear features. Our data was also highly imbalanced. A plain random forest attempts to minimize the overall error rate, so on an unbalanced dataset the larger class tends to get a low error rate while the smaller class gets a larger one; sklearn does, however, offer a way to balance the error across classes (see the sketch below).
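
One simple way to push back against that imbalance (not used in the runs reported above, so scores would differ) is sklearn's built-in class weighting:

In [ ]:
# Sketch: re-weight classes inversely to their frequency; not part of the reported results
from sklearn.ensemble import RandomForestClassifier

RF_balanced = RandomForestClassifier(
    n_jobs=-1,
    max_depth=250,
    class_weight='balanced')  # weights each class by n_samples / (n_classes * class count)
# RF_balanced.fit(train_features_tfidf, y_train)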

Summary and Conclusions¶

Through the implementation of several models on the same dataset with high-dimensional features, we conclude that the random forest algorithm performs well on data with high dimensionality, such as ours. Preprocessing the text (with our preprocess_text function) before generating features is a major part of data preparation and should not be skipped if we want better quality results: it removes noise and leaves us with a clean, quality vocabulary.

The same method can readily be transferred to other healthcare scenarios, as the techniques used here are quite general and not specific to drug review data. We could also apply this model to measure patients' perceptions of the care and treatment they receive and the kinds of services they want, and to build more personalised healthcare plans from patients' prior experience.

This implementation was done in Python, but it could equally be reproduced in another programming language such as R. However, since Python offers a wide range of libraries and modules, it is easier to work with, and the results are easier to reproduce on different systems.

To reduce overfitting (and some bias) we used the L2 normalization technique, which is incorporated in our TF-IDF vectorizer.

We used a maximum of 2000 words as features in our random forest model. Out of those 2000 words, we can see which features have the most impact by checking the 'words score' column: the higher a word's score, the more important the feature. The importance of a feature is computed as the (normalized) total reduction of the criterion brought by that feature, also known as the Gini importance; refer to 'Further Feature Extraction Techniques' above.

Overall, we have met our objective: to classify sentiment in these drug user reviews using a supervised binary text classifier that labels each review as positive or negative, with overall satisfaction leaning to the positive side.

By analyzing the sentiment expressed in online drug reviews, healthcare providers and manufacturers can gain a more comprehensive understanding of the strengths and weaknesses of their products. This information can inform product development and improvement efforts, and help to ensure that products meet the needs and expectations of patients and consumers.

We could potentially improve performance further with a deep learning model, which would require installing libraries such as PyTorch and Hugging Face Transformers. We propose importing a language representation model called BERT (Bidirectional Encoder Representations from Transformers) from the Hugging Face library. This is of course computationally expensive and requires a GPU (e.g. via Google Colab). It will be performed in Part 2.
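
As a minimal, unexecuted preview of that Part 2 approach (assuming the Hugging Face transformers library is installed; the classification head below is untrained and would still need fine-tuning on the review data):

In [ ]:
# Preview sketch for Part 2: load BERT with a binary sentiment head (untrained)
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=2)  # two labels: negative / positive

inputs = tokenizer("This drug worked well for me", return_tensors="pt")
outputs = model(**inputs)    # logits over the two sentiment classes
print(outputs.logits.shape)  # torch.Size([1, 2])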

References¶

[1] Classifying Drug Ratings Using User Reviews with Transformer-Based Language Models medrxiv.org/content/10.1101/2021.04.15.21255573v2.full

[2] Analysis of the effect of sentiment analysis on extracting adverse drug reactions from tweets and forum posts sciencedirect.com/science/article/pii/S1532046416300508

[3] Sentiment Analysis of User-Generated Content on Drug Review Websites researchgate.net/publication/277625450_Sentiment_Analysis_of_User-Generated_Content_on_Drug_Review_Websites

[4] A Gentle Introduction To Text Classification And Sentiment Analysis, Miguel González-Fierro miguelgfierro.com/blog/2017/a-gentle-introduction-to-text-classification-and-sentiment-analysis/

[5] UCI ML Drug Review dataset kaggle.com/jessicali9530/kuc-hackathon-winter-2018?select=drugsComTest_raw.csv

[6] A guide to Text Classification and Sentiment Analysis, Abhijit Roy towardsdatascience.com/a-guide-to-text-classification-and-sentiment-analysis-2ab021796317

[7] Text Preprocessing in NLP with Python codes analyticsvidhya.com/blog/2021/06/text-preprocessing-in-nlp-with-python-codes/

[8] Reference Unavailable

[9] Importance of Text Pre-processing pluralsight.com/guides/importance-of-text-pre-processing

[10] A beginners guide to Sentiment Analysis with Python towardsdatascience.com/a-beginners-guide-to-sentiment-analysis-in-python-95e354ea84f6

[11] Multi-Class Metrics Made Simple, Part II: the F1-score towardsdatascience.com/multi-class-metrics-made-simple-part-ii-the-f1-score-ebe8b2c2ca1

[12] Jurafsky, D. and J.H. Martin Speech and language processing. (2021; 3rd draft ed.). Chapter 6, section 6.8. web.stanford.edu/~jurafsky/slp3/6.pdf

[13] 3 basic approaches in Bag of Words which are better than Word Embeddings towardsdatascience.com/3-basic-approaches-in-bag-of-words-which-are-better-than-word-embeddings-c2cbc7398016

[14] Skewed Data: A problem to your statistical model towardsdatascience.com/skewed-data-a-problem-to-your-statistical-model-9a6b5bb74e37

[15] How to Calculate Precision, Recall, and F-Measure for Imbalanced Classification machinelearningmastery.com/precision-recall-and-f-measure-for-imbalanced-classification/

[16] Topic 5: Text Categorisation and Sentiment Analysis, Lecture 5.106: Bayesian Classification

[17] Jurafsky, D. and J.H. Martin Speech and language processing. (2021; 3rd draft ed.). Chapter 20, section 20.5: Supervised Learning of Word Sentiment. web.stanford.edu/~jurafsky/slp3/20.pdf

[18] Complete guide on how to Use LightGBM in Python analyticsvidhya.com/blog/2021/08/complete-guide-on-how-to-use-lightgbm-in-python/

[19] Which algorithm takes the crown: Light GBM vs XGBOOST? analyticsvidhya.com/blog/2017/06/which-algorithm-takes-the-crown-light-gbm-vs-xgboost/

[20] A Complete Guide to the Random Forest Algorithm builtin.com/data-science/random-forest-algorithm

[21] Selecting good features – Part III: random forests blog.datadive.net/selecting-good-features-part-iii-random-forests/

[22] The Mathematics of Decision Trees, Random Forest and Feature Importance in Scikit-learn and Spark towardsdatascience.com/the-mathematics-of-decision-trees-random-forest-and-feature-importance-in-scikit-learn-and-spark-f2861df67e3

[23] Quick Introduction to Bag-of-Words (BoW) and TF-IDF for Creating Features from Text analyticsvidhya.com/blog/2020/02/quick-introduction-bag-of-words-bow-tf-idf/
