Sklearn Introduction
Published:
Introduction to Sklearn
1. Making of a proper dataset with 3 languages
# Mount Google Drive into the Colab filesystem so the data files are reachable.
from google.colab import drive
# force_remount=True re-mounts cleanly even if a previous mount is still active.
drive.mount('/content/drive', force_remount=True)
Mounted at /content/drive
# Change working directory to the project data folder so files can be
# opened by bare name (e.g. "english.txt") below.
# data link: English - https://drive.google.com/file/d/1-_Bb3PavAML6HqoUJkpyg-zOb1bYZgWe/view?usp=sharing
# German - https://drive.google.com/file/d/1-KljKd9thoKEKdNnf6zZKDvmff8_NKUi/view?usp=sharing
# Vietnamese - https://drive.google.com/file/d/1-7A26XYVi1sbr3tmvlEqnH8SKuzMYMfM/view?usp=sharing
import os
os.chdir("drive/MyDrive/UTC/Intro_ML_Project/Assignment1/")
# required libraries
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')
def generate_data(filename, k_letter=5):
    """Build (features, labels) numpy arrays from a word-list file.

    Every word with exactly ``k_letter`` characters is converted to a
    vector of Unicode code points (one ``ord`` value per character).
    The class label is derived from the filename: 0 for "english.txt",
    1 for "german.txt", 2 for anything else (e.g. "vietnamese.txt").

    Parameters
    ----------
    filename : str
        Path to the word list.  "german.txt" is parsed with pandas using
        ISO-8859-1 encoding; any other file is read line by line.
    k_letter : int, default 5
        Required word length; words of any other length are skipped.

    Returns
    -------
    tuple of (np.ndarray, np.ndarray)
        Feature matrix of shape (n, k_letter) and label vector of shape (n,).
    """
    # Read the word list.  german.txt needs a non-UTF-8 encoding, so it goes
    # through pandas; everything else is plain readlines().
    if filename != 'german.txt':
        # Fixed: original called open() without ever closing the handle;
        # a context manager guarantees the file is released.
        with open(filename) as f:
            list_words = f.readlines()
    else:
        df_w = pd.read_csv(filename, encoding="ISO-8859-1", header=None)
        df_w = df_w.dropna()
        list_words = df_w.iloc[:, 0].values

    feature_dataset = []
    target_dataset = []
    for word in list_words:
        # Lines from plain text files still carry the trailing newline;
        # pandas values do not.
        if filename != 'german.txt':
            cleaned_word = word.replace('\n', '')
        else:
            cleaned_word = word
        # Keep only words of the requested length so every feature vector
        # has exactly k_letter components (not hard-coded to 5).
        if len(cleaned_word) == k_letter:
            # One Unicode code point per character.
            word_to_ord = [ord(char) for char in cleaned_word]
            feature_dataset.append(word_to_ord)
            # Label by source file: 0 = English, 1 = German, 2 = other.
            # NOTE(review): comparison is against the exact relative filename;
            # passing a full path would always yield label 2 — confirm callers.
            if filename == "english.txt":
                target_dataset.append(0)
            elif filename == "german.txt":
                target_dataset.append(1)
            else:
                target_dataset.append(2)
    return np.array(feature_dataset), np.array(target_dataset)
# Generate the per-language datasets (label 0 = English, 1 = German, 2 = Vietnamese).
en_feat, en_target = generate_data("english.txt", k_letter=5)
# German words need the special pandas/ISO-8859-1 path inside generate_data.
gm_feat, gm_target = generate_data("german.txt", k_letter=5)
# Vietnamese falls into the "other" label branch (label 2).
vn_feat, vn_target = generate_data("vietnamese.txt", k_letter=5)
# Stack the three languages into one feature matrix and one label vector.
all_feat = np.concatenate([en_feat, gm_feat,vn_feat], axis=0)
all_target = np.concatenate([en_target, gm_target,vn_target], axis=0)
# Sanity-check the shapes: row counts per language must sum to the total.
print(en_feat.shape, gm_feat.shape, vn_feat.shape, all_feat.shape)
print(en_target.shape, gm_target.shape, vn_target.shape, all_target.shape)
(11435, 5) (3234, 5) (3025, 5) (17694, 5)
(11435,) (3234,) (3025,) (17694,)
2. Making a training and testing dataset split.
# 80/20 train/test split; stratify keeps the class proportions identical in
# both splits, random_state=42 makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(all_feat, all_target, test_size=0.2, random_state=42, shuffle=True, stratify=all_target)
# Sanity-check the shapes after the split.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
(14155, 5) (14155,) (3539, 5) (3539,)
3. Being able to train the models and make predictions.
# KNN model
knn = KNeighborsClassifier(n_neighbors=4, metric='euclidean') # 4 nearest neighbours, Euclidean metric
# Fit the classifier on the training split.
knn.fit(X_train, y_train)
# Predict labels for the held-out test split.
y_pred_knn = knn.predict(X_test)
# Accuracy as a percentage, rounded to one decimal place.
knn_acc = round(accuracy_score(y_test, y_pred_knn) * 100,1)
print("Accuracy of KNN on test data: ", knn_acc)
# Per-class precision/recall/F1 breakdown.
print(classification_report(y_test, y_pred_knn))
Accuracy of KNN on test data: 89.7
precision recall f1-score support
0 0.88 0.98 0.93 2287
1 0.88 0.59 0.70 647
2 0.98 0.91 0.94 605
accuracy 0.90 3539
macro avg 0.91 0.83 0.86 3539
weighted avg 0.90 0.90 0.89 3539
# SVM model
sv = svm.SVC(C=5.0, kernel='rbf') # RBF kernel, regularization C=5.0
# Fit the classifier on the training split.
sv.fit(X_train, y_train)
# Predict labels for the held-out test split.
y_pred_svm = sv.predict(X_test)
# Accuracy as a percentage, rounded to one decimal place.
sv_acc = round(accuracy_score(y_test, y_pred_svm)*100,1)
print("Accuracy of SVM on test data: ", sv_acc)
# Per-class precision/recall/F1 breakdown.
print(classification_report(y_test, y_pred_svm))
Accuracy of SVM on test data: 79.5
precision recall f1-score support
0 0.77 1.00 0.87 2287
1 0.76 0.13 0.22 647
2 0.99 0.74 0.85 605
accuracy 0.80 3539
macro avg 0.84 0.62 0.64 3539
weighted avg 0.80 0.80 0.75 3539
# MLP model
mlp = MLPClassifier(
hidden_layer_sizes=(16,),
activation = 'logistic',
max_iter=100,
alpha=1e-4,
solver="sgd",
verbose=10,
random_state=42,
learning_rate_init=0.02,
) # single hidden layer of 16 logistic units, trained with SGD
# Fit the classifier on the training split (verbose logs per-epoch loss).
mlp.fit(X_train, y_train)
# Predict labels for the held-out test split.
y_pred_mlp = mlp.predict(X_test)
# Accuracy as a percentage, rounded to one decimal place.
mlp_acc = round(accuracy_score(y_test, y_pred_mlp)*100,1)
print("Accuracy of MLP on test data: ", mlp_acc)
# Per-class precision/recall/F1 breakdown.
print(classification_report(y_test, y_pred_mlp))
Iteration 1, loss = 0.78381191
Iteration 2, loss = 0.57109453
Iteration 3, loss = 0.52196303
Iteration 4, loss = 0.47524718
Iteration 5, loss = 0.49233903
Iteration 6, loss = 0.44109614
Iteration 7, loss = 0.41557871
Iteration 8, loss = 0.44645181
Iteration 9, loss = 0.56176867
Iteration 10, loss = 0.55387634
Iteration 11, loss = 0.54611552
Iteration 12, loss = 0.54698724
Iteration 13, loss = 0.53984274
Iteration 14, loss = 0.53166853
Iteration 15, loss = 0.52611560
Iteration 16, loss = 0.52318489
Iteration 17, loss = 0.51651663
Iteration 18, loss = 0.51654785
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy of MLP on test data: 81.0
precision recall f1-score support
0 0.78 1.00 0.87 2287
1 0.94 0.12 0.22 647
2 0.98 0.83 0.90 605
accuracy 0.81 3539
macro avg 0.90 0.65 0.66 3539
weighted avg 0.84 0.81 0.76 3539
4. Graph the results of each model (KNN, SVM, MLP)
# Install kaleido, the static-image backend required by fig.write_image below.
!pip install -U kaleido -q
# One bar per model.
labels = ["KNN", "SVM", "MLP"]
# Test-set accuracies (percentages) collected from the cells above.
results = [knn_acc, sv_acc, mlp_acc]
# NOTE(review): df_results is never used below — candidate for removal.
df_results = pd.DataFrame.from_dict({"models": labels, "results": results})
fig = go.Figure()
# Horizontal bar chart of model accuracies.
fig.add_trace(go.Bar(
x=results,
y=labels,
marker=dict(
color='rgba(50, 171, 96, 0.6)',
line=dict(
color='rgba(50, 171, 96, 1.0)',
width=1),
),
orientation='h', text=results
))
# Place the accuracy value label outside each bar.
fig.update_traces(textposition='outside')
# Chart title, axis labels, margins, and fixed 800x450 canvas.
fig.update_layout(
title='Machine Learning Models Comparison',
xaxis_title="Accuracy",
yaxis_title="Models",
legend=dict(x=0.029, y=1.038, font_size=10),
margin=dict(l=100, r=20, t=70, b=70),
paper_bgcolor='rgb(248, 248, 255)',
plot_bgcolor='rgb(248, 248, 255)',
autosize=False,
width=800,
height=450,
)
# Display the graph inline.
fig.show()
# Export the chart as a PNG (uses kaleido installed above).
fig.write_image("models_result.png")
