import numpy as np
import pandas as pd
import matplotlib.pyplot as pl
# Load the wine dataset: the first 13 columns are chemical-analysis features,
# the last column is the wine class label (1, 2, or 3).
# NOTE: the original used smart quotes (“wine.csv”), which is a SyntaxError.
ds = pd.read_csv("wine.csv")

# Feature matrix (13 columns) and target vector (last column).
X = ds.iloc[:, 0:13]
y = ds.iloc[:, -1]

from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; fix the seed so the split
# (and every downstream result) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Standardize the features before PCA: PCA is variance-based, so every
# column must be on a comparable scale (zero mean, unit variance).
from sklearn.preprocessing import StandardScaler

# BUG FIX: `st` was used below but never created in the original.
st = StandardScaler()

# Fit the scaler on the training data only, then apply the *same* learned
# transform to the test data — fitting on test data would leak statistics.
X_train = st.fit_transform(X_train)
X_test = st.transform(X_test)
# Project the 13 standardized features onto the first 2 principal components.
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
# Learn the components on the training set; reuse them for the test set.
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

# Fraction of total variance captured by each component, sorted descending
# (observed roughly [0.36, 0.19] on this dataset).
print(pca.explained_variance_ratio_)
# Classify the wines in the 2-D principal-component space.
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Predicted class labels (1, 2, or 3) for the held-out samples.
y_predict = logreg.predict(X_test)
# Compare actual vs. predicted labels with a 3x3 confusion matrix
# (one row/column per wine class; the diagonal holds correct predictions).
from sklearn.metrics import confusion_matrix

c = confusion_matrix(y_test, y_predict)
print(c)
# Scatter-plot the training samples in the space of the first two principal
# components, one colour per wine class.
# NOTE: the original used smart quotes (‘one’), which is a SyntaxError.
X_disp, y_disp = X_train, y_train
for cls, cls_name in ((1, "one"), (2, "two"), (3, "three")):
    mask = y_disp == cls
    pl.scatter(X_disp[mask, 0], X_disp[mask, 1], label=cls_name)
pl.legend()
# Render the figure when run as a plain script (a no-op in most notebooks).
pl.show()