Sklearn Introduction
Published:
Introduction to Sklearn
1. Making of a proper dataset with 3 languages
# Mount Google Drive into the Colab filesystem so the data files are reachable.
from google.colab import drive
# force_remount=True re-mounts cleanly even if a previous mount is still active.
drive.mount('/content/drive', force_remount=True)
Mounted at /content/drive
# Change working directory to the project data folder so files can be
# opened by bare name (e.g. "english.txt") below.
# data link: English - https://drive.google.com/file/d/1-_Bb3PavAML6HqoUJkpyg-zOb1bYZgWe/view?usp=sharing
# German - https://drive.google.com/file/d/1-KljKd9thoKEKdNnf6zZKDvmff8_NKUi/view?usp=sharing
# Vietnamese - https://drive.google.com/file/d/1-7A26XYVi1sbr3tmvlEqnH8SKuzMYMfM/view?usp=sharing
import os
os.chdir("drive/MyDrive/UTC/Intro_ML_Project/Assignment1/")
# required libraries
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')
def generate_data(filename, k_letter=5):
    """Build (features, labels) numpy arrays from a word-list file.

    Every word with exactly ``k_letter`` characters is converted to a
    vector of Unicode code points (one ``ord`` value per character).
    The class label is derived from the filename: 0 for "english.txt",
    1 for "german.txt", 2 for anything else (e.g. "vietnamese.txt").

    Parameters
    ----------
    filename : str
        Path to the word list.  "german.txt" is parsed with pandas using
        ISO-8859-1 encoding; any other file is read line by line.
    k_letter : int, default 5
        Required word length; words of any other length are skipped.

    Returns
    -------
    tuple of (np.ndarray, np.ndarray)
        Feature matrix of shape (n, k_letter) and label vector of shape (n,).
    """
    # Read the word list.  german.txt needs a non-UTF-8 encoding, so it goes
    # through pandas; everything else is plain readlines().
    if filename != 'german.txt':
        # Fixed: original called open() without ever closing the handle;
        # a context manager guarantees the file is released.
        with open(filename) as f:
            list_words = f.readlines()
    else:
        df_w = pd.read_csv(filename, encoding="ISO-8859-1", header=None)
        df_w = df_w.dropna()
        list_words = df_w.iloc[:, 0].values

    feature_dataset = []
    target_dataset = []
    for word in list_words:
        # Lines from plain text files still carry the trailing newline;
        # pandas values do not.
        if filename != 'german.txt':
            cleaned_word = word.replace('\n', '')
        else:
            cleaned_word = word
        # Keep only words of the requested length so every feature vector
        # has exactly k_letter components (not hard-coded to 5).
        if len(cleaned_word) == k_letter:
            # One Unicode code point per character.
            word_to_ord = [ord(char) for char in cleaned_word]
            feature_dataset.append(word_to_ord)
            # Label by source file: 0 = English, 1 = German, 2 = other.
            # NOTE(review): comparison is against the exact relative filename;
            # passing a full path would always yield label 2 — confirm callers.
            if filename == "english.txt":
                target_dataset.append(0)
            elif filename == "german.txt":
                target_dataset.append(1)
            else:
                target_dataset.append(2)
    return np.array(feature_dataset), np.array(target_dataset)
# Generate the per-language datasets (label 0 = English, 1 = German, 2 = Vietnamese).
en_feat, en_target = generate_data("english.txt", k_letter=5)
# German words need the special pandas/ISO-8859-1 path inside generate_data.
gm_feat, gm_target = generate_data("german.txt", k_letter=5)
# Vietnamese falls into the "other" label branch (label 2).
vn_feat, vn_target = generate_data("vietnamese.txt", k_letter=5)
# Stack the three languages into one feature matrix and one label vector.
all_feat = np.concatenate([en_feat, gm_feat,vn_feat], axis=0)
all_target = np.concatenate([en_target, gm_target,vn_target], axis=0)
# Sanity-check the shapes: row counts per language must sum to the total.
print(en_feat.shape, gm_feat.shape, vn_feat.shape, all_feat.shape)
print(en_target.shape, gm_target.shape, vn_target.shape, all_target.shape)
(11435, 5) (3234, 5) (3025, 5) (17694, 5)
(11435,) (3234,) (3025,) (17694,)
2. Making a training and testing dataset split.
# 80/20 train/test split; stratify keeps the class proportions identical in
# both splits, random_state=42 makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(all_feat, all_target, test_size=0.2, random_state=42, shuffle=True, stratify=all_target)
# Sanity-check the shapes after the split.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
(14155, 5) (14155,) (3539, 5) (3539,)
3. Being able to train the models and make predictions.
# KNN model
knn = KNeighborsClassifier(n_neighbors=4, metric='euclidean') # 4 nearest neighbours, Euclidean metric
# Fit the classifier on the training split.
knn.fit(X_train, y_train)
# Predict labels for the held-out test split.
y_pred_knn = knn.predict(X_test)
# Accuracy as a percentage, rounded to one decimal place.
knn_acc = round(accuracy_score(y_test, y_pred_knn) * 100,1)
print("Accuracy of KNN on test data: ", knn_acc)
# Per-class precision/recall/F1 breakdown.
print(classification_report(y_test, y_pred_knn))
Accuracy of KNN on test data: 89.7
precision recall f1-score support
0 0.88 0.98 0.93 2287
1 0.88 0.59 0.70 647
2 0.98 0.91 0.94 605
accuracy 0.90 3539
macro avg 0.91 0.83 0.86 3539
weighted avg 0.90 0.90 0.89 3539
# SVM model
sv = svm.SVC(C=5.0, kernel='rbf') # RBF kernel, regularization C=5.0
# Fit the classifier on the training split.
sv.fit(X_train, y_train)
# Predict labels for the held-out test split.
y_pred_svm = sv.predict(X_test)
# Accuracy as a percentage, rounded to one decimal place.
sv_acc = round(accuracy_score(y_test, y_pred_svm)*100,1)
print("Accuracy of SVM on test data: ", sv_acc)
# Per-class precision/recall/F1 breakdown.
print(classification_report(y_test, y_pred_svm))
Accuracy of SVM on test data: 79.5
precision recall f1-score support
0 0.77 1.00 0.87 2287
1 0.76 0.13 0.22 647
2 0.99 0.74 0.85 605
accuracy 0.80 3539
macro avg 0.84 0.62 0.64 3539
weighted avg 0.80 0.80 0.75 3539
# MLP model
mlp = MLPClassifier(
hidden_layer_sizes=(16,),
activation = 'logistic',
max_iter=100,
alpha=1e-4,
solver="sgd",
verbose=10,
random_state=42,
learning_rate_init=0.02,
) # single hidden layer of 16 logistic units, trained with SGD
# Fit the classifier on the training split (verbose logs per-epoch loss).
mlp.fit(X_train, y_train)
# Predict labels for the held-out test split.
y_pred_mlp = mlp.predict(X_test)
# Accuracy as a percentage, rounded to one decimal place.
mlp_acc = round(accuracy_score(y_test, y_pred_mlp)*100,1)
print("Accuracy of MLP on test data: ", mlp_acc)
# Per-class precision/recall/F1 breakdown.
print(classification_report(y_test, y_pred_mlp))
Iteration 1, loss = 0.78381191
Iteration 2, loss = 0.57109453
Iteration 3, loss = 0.52196303
Iteration 4, loss = 0.47524718
Iteration 5, loss = 0.49233903
Iteration 6, loss = 0.44109614
Iteration 7, loss = 0.41557871
Iteration 8, loss = 0.44645181
Iteration 9, loss = 0.56176867
Iteration 10, loss = 0.55387634
Iteration 11, loss = 0.54611552
Iteration 12, loss = 0.54698724
Iteration 13, loss = 0.53984274
Iteration 14, loss = 0.53166853
Iteration 15, loss = 0.52611560
Iteration 16, loss = 0.52318489
Iteration 17, loss = 0.51651663
Iteration 18, loss = 0.51654785
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy of MLP on test data: 81.0
precision recall f1-score support
0 0.78 1.00 0.87 2287
1 0.94 0.12 0.22 647
2 0.98 0.83 0.90 605
accuracy 0.81 3539
macro avg 0.90 0.65 0.66 3539
weighted avg 0.84 0.81 0.76 3539
4. Graph the results of each model (KNN, SVM, MLP)
# Install kaleido, the static-image backend required by fig.write_image below.
!pip install -U kaleido -q
# One bar per model.
labels = ["KNN", "SVM", "MLP"]
# Test-set accuracies (percentages) collected from the cells above.
results = [knn_acc, sv_acc, mlp_acc]
# NOTE(review): df_results is never used below — candidate for removal.
df_results = pd.DataFrame.from_dict({"models": labels, "results": results})
fig = go.Figure()
# Horizontal bar chart of model accuracies.
fig.add_trace(go.Bar(
x=results,
y=labels,
marker=dict(
color='rgba(50, 171, 96, 0.6)',
line=dict(
color='rgba(50, 171, 96, 1.0)',
width=1),
),
orientation='h', text=results
))
# Place the accuracy value label outside each bar.
fig.update_traces(textposition='outside')
# Chart title, axis labels, margins, and fixed 800x450 canvas.
fig.update_layout(
title='Machine Learning Models Comparison',
xaxis_title="Accuracy",
yaxis_title="Models",
legend=dict(x=0.029, y=1.038, font_size=10),
margin=dict(l=100, r=20, t=70, b=70),
paper_bgcolor='rgb(248, 248, 255)',
plot_bgcolor='rgb(248, 248, 255)',
autosize=False,
width=800,
height=450,
)
# Display the graph inline.
fig.show()
# Export the chart as a PNG (uses kaleido installed above).
fig.write_image("models_result.png")
