'''mlModels uses a knn classifier and decision
trees to explore the UCI EEG Dataset'''
#bc ┌( ಠ_ಠ)┘@thirdBrainPrograms
import argparse
import logging
import matplotlib
import matplotlib.pyplot as plt
import os.path
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
#configure the root logger: everything at DEBUG and above goes to eeg2.log,
#which is truncated ('w' mode) on every run
logger = logging.getLogger()
logger.setLevel(logging.DEBUG) #root must be at DEBUG so handlers see all records
logFile = logging.FileHandler('eeg2.log', 'w') #create/truncate the log file
logFile.setLevel(logging.DEBUG) #file handler also accepts DEBUG and above
logger.addHandler(logFile) #attach the file handler to the root logger
#NOTE: because the root logger is at DEBUG, matplotlib emits a lot of
#(somewhat useful) internal DEBUG records into the log whenever a plot
#is generated.
#the CSV read below already has the Fourier Transform applied
def cleanDataAverage():
    '''Clean and aggregate the raw absolute-power trials, averaging all
    trials per subject.

    Rows containing a zero anywhere (missing time/frequency data) are
    discarded, and a binary ``status`` label (1 = alcoholic, 0 = control)
    is appended for ML modeling. Writes the result to
    ``EEG_averagedPowers.csv``.'''
    raw = pd.read_csv('EEG_powers.csv')
    logging.debug('Raw absolute CSV loaded.')
    # strip the auto-generated "Unnamed: 0" index column left by the export
    indexCols = raw.columns[raw.columns.str.contains('unnamed', case=False)]
    raw.drop(indexCols, axis=1, inplace=True)
    # a zero in any column marks missing data; keep only all-non-zero rows
    for col in raw.columns:
        raw = raw[raw[col] != 0]
    logging.debug('Zeros dropped from the analysis.')
    # the 4th character of the subject code encodes condition ('a' = alcoholic)
    raw['status'] = (raw['subject'].str.slice(start=3, stop=4) == "a").astype(int)
    logging.debug('Conditional aspects configured.')
    # collapse all trials of each subject into a single averaged row
    perSubject = raw.groupby(raw.subject).mean()
    logging.debug('Data aggregated and trial signals averaged.')
    perSubject.to_csv('EEG_averagedPowers.csv', index=True)
    logging.debug('Trial Average Ready for ML Analysis.')
def cleanDataMedian():
    '''Clean and aggregate the raw absolute-power trials, taking the
    median over all trials per subject.

    Rows containing a zero anywhere (missing time/frequency data) are
    discarded, and a binary ``status`` label (1 = alcoholic, 0 = control)
    is appended for ML modeling. Writes the result to
    ``EEG_medianPowers.csv``.'''
    raw = pd.read_csv('EEG_powers.csv')
    logging.debug('Raw absolute CSV loaded.')
    # strip the auto-generated "Unnamed: 0" index column left by the export
    indexCols = raw.columns[raw.columns.str.contains('unnamed', case=False)]
    raw.drop(indexCols, axis=1, inplace=True)
    # a zero in any column marks missing data; keep only all-non-zero rows
    for col in raw.columns:
        raw = raw[raw[col] != 0]
    logging.debug('Zeros dropped from the analysis.')
    # the 4th character of the subject code encodes condition ('a' = alcoholic)
    raw['status'] = (raw['subject'].str.slice(start=3, stop=4) == "a").astype(int)
    logging.debug('Conditional aspects configured.')
    # collapse all trials of each subject into a single median row
    perSubject = raw.groupby(raw.subject).median()
    logging.debug('Data aggregated and trial signals median.')
    perSubject.to_csv('EEG_medianPowers.csv', index=True)
    logging.debug('Median Ready for ML Analysis.')
def knnEEG_1Mean():
    '''Score a 1-nearest-neighbor classifier on the subject-averaged EEG
    powers (alcoholic vs. control) and print the held-out test accuracy.'''
    data = pd.read_csv('EEG_averagedPowers.csv')
    # target: 1 = alcoholic, 0 = control
    labels = data.status
    # features: everything except the subject id and the label itself
    features = data.drop(['subject', 'status'], axis=1)
    # stratified 70/30 split keeps the class balance in both partitions
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.30, random_state=42,
        stratify=data.status)
    model = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
    predictions = model.predict(X_test)
    print('Test set score for KNN 1-Neighbor: {:.2f}'.format(np.mean(predictions == y_test)))
def knnEEG_3Mean():
    '''Score a 3-nearest-neighbors classifier on the subject-averaged EEG
    powers (alcoholic vs. control) and print the held-out test accuracy.'''
    data = pd.read_csv('EEG_averagedPowers.csv')
    # target: 1 = alcoholic, 0 = control
    labels = data.status
    # features: everything except the subject id and the label itself
    features = data.drop(['subject', 'status'], axis=1)
    # stratified 70/30 split keeps the class balance in both partitions
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.30, random_state=42,
        stratify=data.status)
    model = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
    predictions = model.predict(X_test)
    print('Test set score for KNN 3-Neighbors: {:.2f}'.format(np.mean(predictions == y_test)))
def knnEEG_10Mean():
    '''Score a 10-nearest-neighbors classifier on the subject-averaged EEG
    powers (alcoholic vs. control) and print the held-out test accuracy.'''
    data = pd.read_csv('EEG_averagedPowers.csv')
    # target: 1 = alcoholic, 0 = control
    labels = data.status
    # features: everything except the subject id and the label itself
    features = data.drop(['subject', 'status'], axis=1)
    # stratified 70/30 split keeps the class balance in both partitions
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.30, random_state=42,
        stratify=data.status)
    model = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)
    predictions = model.predict(X_test)
    print('Test set score for KNN 10-Neighbors: {:.2f}'.format(np.mean(predictions == y_test)))
def knn_eegPlotMean():
    '''Plot train vs. test accuracy of KNN classifiers (k = 1..15) fit on
    the subject-averaged EEG powers.'''
    data = pd.read_csv('EEG_averagedPowers.csv')
    # target: 1 = alcoholic, 0 = control
    labels = data.status
    # features: everything except the subject id and the label itself
    features = data.drop(['subject', 'status'], axis=1)
    # stratified 70/30 split keeps the class balance in both partitions
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.30, random_state=42,
        stratify=data.status)
    # sweep the neighbor count and record accuracy on both partitions
    ks = range(1, 16)
    fitted = [KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
              for k in ks]
    train_scores = [m.score(X_train, y_train) for m in fitted]
    test_scores = [m.score(X_test, y_test) for m in fitted]
    plt.plot(ks, train_scores, label="Train Accuracy")
    plt.plot(ks, test_scores, label="Test Accuracy")
    plt.ylabel("Accuracy")
    plt.xlabel("n_neighbors")
    plt.legend()
    plt.show()
def eeg_treeMean():
    '''Fit a depth-4 decision tree on the subject-averaged EEG powers
    (alcoholic vs. control) and plot its feature importances.'''
    df = pd.read_csv('EEG_averagedPowers.csv')
    #y = alcohol/no alcohol
    y = df.status
    #X = dataframe without the subject number or status identifier
    X = df.drop(['subject', 'status'], axis=1)
    # split into training and test sets of data (stratified 70/30)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
        test_size=0.30, random_state=42, stratify=df.status)
    tree = DecisionTreeClassifier(max_depth=4, random_state=0)
    tree.fit(X_train, y_train)
    # derive the feature count from the data instead of hard-coding 60 so
    # the bar plot stays correct if the channel/band columns ever change
    n_features = X.shape[1]
    plt.barh(range(n_features), tree.feature_importances_, align='center')
    # label ticks with the actual feature columns (same set the tree saw)
    plt.yticks(np.arange(n_features), X.columns, fontsize=10)
    plt.xlabel('Feature Importance')
    plt.ylabel('Feature')
    plt.ylim(-1, n_features)
    plt.show()
def knnEEG_1Median():
    '''Score a 1-nearest-neighbor classifier on the per-subject median EEG
    powers (alcoholic vs. control) and print the held-out test accuracy.'''
    data = pd.read_csv('EEG_medianPowers.csv')
    # target: 1 = alcoholic, 0 = control
    labels = data.status
    # features: everything except the subject id and the label itself
    features = data.drop(['subject', 'status'], axis=1)
    # stratified 70/30 split keeps the class balance in both partitions
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.30, random_state=42,
        stratify=data.status)
    model = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
    predictions = model.predict(X_test)
    print('Test set score for KNN 1-Neighbor: {:.2f}'.format(np.mean(predictions == y_test)))
def knnEEG_3Median():
    '''Score a 3-nearest-neighbors classifier on the per-subject median EEG
    powers (alcoholic vs. control) and print the held-out test accuracy.'''
    data = pd.read_csv('EEG_medianPowers.csv')
    # target: 1 = alcoholic, 0 = control
    labels = data.status
    # features: everything except the subject id and the label itself
    features = data.drop(['subject', 'status'], axis=1)
    # stratified 70/30 split keeps the class balance in both partitions
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.30, random_state=42,
        stratify=data.status)
    model = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
    predictions = model.predict(X_test)
    print('Test set score for KNN 3-Neighbors: {:.2f}'.format(np.mean(predictions == y_test)))
def knnEEG_10Median():
    '''Score a 10-nearest-neighbors classifier on the per-subject median EEG
    powers (alcoholic vs. control) and print the held-out test accuracy.'''
    data = pd.read_csv('EEG_medianPowers.csv')
    # target: 1 = alcoholic, 0 = control
    labels = data.status
    # features: everything except the subject id and the label itself
    features = data.drop(['subject', 'status'], axis=1)
    # stratified 70/30 split keeps the class balance in both partitions
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.30, random_state=42,
        stratify=data.status)
    model = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)
    predictions = model.predict(X_test)
    print('Test set score for KNN 10-Neighbors: {:.2f}'.format(np.mean(predictions == y_test)))
def knn_eegPlotMedian():
    '''Plot train vs. test accuracy of KNN classifiers (k = 1..15) fit on
    the per-subject median EEG powers.'''
    data = pd.read_csv('EEG_medianPowers.csv')
    # target: 1 = alcoholic, 0 = control
    labels = data.status
    # features: everything except the subject id and the label itself
    features = data.drop(['subject', 'status'], axis=1)
    # stratified 70/30 split keeps the class balance in both partitions
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.30, random_state=42,
        stratify=data.status)
    # sweep the neighbor count and record accuracy on both partitions
    ks = range(1, 16)
    fitted = [KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
              for k in ks]
    train_scores = [m.score(X_train, y_train) for m in fitted]
    test_scores = [m.score(X_test, y_test) for m in fitted]
    plt.plot(ks, train_scores, label="Train Accuracy")
    plt.plot(ks, test_scores, label="Test Accuracy")
    plt.ylabel("Accuracy")
    plt.xlabel("n_neighbors")
    plt.legend()
    plt.show()
def eeg_treeMedian():
    '''Fit a depth-4 decision tree on the per-subject median EEG powers
    (alcoholic vs. control) and plot its feature importances.'''
    df = pd.read_csv('EEG_medianPowers.csv')
    #y = alcohol/no alcohol
    y = df.status
    #X = dataframe without the subject number or status identifier
    X = df.drop(['subject', 'status'], axis=1)
    # split into training and test sets of data (stratified 70/30)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
        test_size=0.30, random_state=42, stratify=df.status)
    tree = DecisionTreeClassifier(max_depth=4, random_state=0)
    tree.fit(X_train, y_train)
    # derive the feature count from the data instead of hard-coding 60 so
    # the bar plot stays correct if the channel/band columns ever change
    n_features = X.shape[1]
    plt.barh(range(n_features), tree.feature_importances_, align='center')
    # label ticks with the actual feature columns (same set the tree saw)
    plt.yticks(np.arange(n_features), X.columns, fontsize=10)
    plt.xlabel('Feature Importance')
    plt.ylabel('Feature')
    plt.ylim(-1, n_features)
    plt.show()
def stratifyBarPlot():
    '''Bar-plot the raw counts of alcoholic vs. control participants in
    the study (before any model is trained).'''
    df = pd.read_csv('EEG_averagedPowers.csv')
    # status column: 1 = alcoholic, 0 = control
    nAlcohol = len(df.status.loc[df.status == 1])
    nControl = len(df.status.loc[df.status == 0])
    plt.bar(['Alcohol', 'Control'], [nAlcohol, nControl])
    plt.ylabel('Number of Participants')
    plt.title('Number of Alcoholic vs. Control Participants')
    plt.show()
def StratifyPiePlot():
    '''Pie-chart the ratio of alcoholic vs. control participants in the
    study (before any model is trained).'''
    df = pd.read_csv('EEG_averagedPowers.csv')
    # status column: 1 = alcoholic, 0 = control
    nAlcohol = len(df.status.loc[df.status == 1])
    nControl = len(df.status.loc[df.status == 0])
    # slices are ordered and plotted counter-clockwise; the first slice
    # ('Alcoholic') is pulled out of the pie for emphasis
    plt.pie([nAlcohol, nControl], explode=(0, 0.1),
            labels=('Alcoholic', 'Control'), autopct='%1.1f%%',
            shadow=True, startangle=90)
    # equal aspect ratio keeps the pie circular
    plt.axis('equal')
    plt.show()
def main():
    '''Parse command line arguments and dispatch to the requested KNN
    score ('print') or plot ('graph'). The cleaned average/median CSVs
    are (re)generated on first run if missing.'''
    #initialize to parse command line arguments
    parse = argparse.ArgumentParser()
    #nearest neighbor scores only take effect when 'print' is called / plots need 'graph'
    parse.add_argument('command', metavar='<command>',
        choices=['print', 'graph'], help='execute command')
    parse.add_argument('--knn', '-k', choices=['knn1Mean', 'knn3Mean',
        'knn10Mean', 'knn1Median', 'knn3Median', 'knn10Median'],
        help='Select number of neighbors')
    parse.add_argument('--plot', '-p',
        choices=['knnMean', 'treeMean', 'knnMedian', 'treeMedian', 'stratifyBar',
        'stratifyPie'], help='Select plot (knn) or (tree)')
    #regenerate the cleaned average file only if it does not exist yet
    if not os.path.exists('EEG_averagedPowers.csv'):
        cleanDataAverage()
    #regenerate the cleaned median file only if it does not exist yet
    if not os.path.exists('EEG_medianPowers.csv'):
        cleanDataMedian()
    args = parse.parse_args()
    if args.command == 'print':
        #map each --knn choice to its runner
        #BUG FIX: 'knn10Median' previously dispatched to knnEEG_10Mean()
        knnRunners = {
            'knn1Mean': knnEEG_1Mean,
            'knn3Mean': knnEEG_3Mean,
            'knn10Mean': knnEEG_10Mean,
            'knn1Median': knnEEG_1Median,
            'knn3Median': knnEEG_3Median,
            'knn10Median': knnEEG_10Median,
        }
        if args.knn in knnRunners:
            knnRunners[args.knn]()
    if args.command == 'graph':
        #map each --plot choice to its plotting function
        plotters = {
            'knnMean': knn_eegPlotMean,
            'treeMean': eeg_treeMean,
            'knnMedian': knn_eegPlotMedian,
            'treeMedian': eeg_treeMedian,
            'stratifyBar': stratifyBarPlot,
            'stratifyPie': StratifyPiePlot,
        }
        if args.plot in plotters:
            plotters[args.plot]()
if __name__ == "__main__":
    main()