logo_ige
logo_uga

Machine Learning to predict location of ice recrystallization



May - July 2022
UGA and IGE internship
M1 Statistics and Data Sciences (SSD)

Renan MANCEAUX
Supervisor : Thomas CHAUVE

Dimensional Reduction


8.4. Principal Components Analysis (PCA) on pixel dataset (with anisotropy)#

Exploration process before apply machine learning to craft data. Linear reduction of dimensions based on “least square” algorithm and projection into easyer representativ space made by linear combinations of variables.

8.4.1. Import packages#

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns


import sys
sys.path.append("../../scripts/")
import utils


from sklearn.decomposition import PCA

8.4.2. Loading data#

CI02 = utils.load_data("../../data/for_learning_plus/CI02.npy")
CI04 = utils.load_data("../../data/for_learning_plus/CI04.npy")
CI06 = utils.load_data("../../data/for_learning_plus/CI06.npy")
CI09 = utils.load_data("../../data/for_learning_plus/CI09.npy")
CI21 = utils.load_data("../../data/for_learning_plus/CI21.npy")

CI02['batch'] = ['CI02'] * np.shape(CI02)[0]
CI04['batch'] = ['CI04'] * np.shape(CI04)[0]
CI06['batch'] = ['CI06'] * np.shape(CI06)[0]
CI09['batch'] = ['CI09'] * np.shape(CI09)[0]
CI21['batch'] = ['CI21'] * np.shape(CI21)[0]

8.4.3. PCA with anisotropy factors#

8.4.3.1. Variables selection#

data = pd.concat((CI02,CI04,CI06,CI09,CI21))
data['Y'] = data['Y'].astype(int)
data['batch'] = data['batch'].astype(object)
X = data.loc[:,((data.columns != 'Y')&(data.columns != 'batch'))] 
y = data['Y']
b = data['batch']  

8.4.3.2. Normalization#

norm_X = (X - X.mean())/X.std()

8.4.3.3. Apply PCA#

pca = PCA()
res_pca  = pca.fit(norm_X)

8.4.4. Results#

8.4.4.1. Eigen values / Explained variance ratio#

plt.bar(np.linspace(1,14,14),res_pca.explained_variance_ratio_,width=0.5)
plt.show()
../../_images/acp_add_data_16_0.png

8.4.4.2. First Principal plan of individuals#

components = pca.fit_transform(norm_X)

PCDf = pd.DataFrame(data = components
             , columns = ['PC 1', 'PC 2', 'PC 3', 'PC 4', 'PC 5','PC 6', 'PC 7', 'PC 8', 'PC 9', 'PC 10', 'PC 11', 'PC 12', 'PC 13', 'PC 14'])

colors_lab = {1:'red', 0 :'blue'}

colors_batch = {'CI02':'red', 'CI04':'green', 'CI06':'blue', 'CI09':'yellow', 'CI21':'black'}
# plt.figure(figsize=(8,8))
# plt.scatter(components.T[0],components.T[1],alpha=0.3,color = b.map(colors_batch))
# plt.xlabel(f"PC 1 ({(pca.explained_variance_ratio_[0] * 100):.1f}%)")
# plt.ylabel(f"PC 2 ({(pca.explained_variance_ratio_[1] * 100):.1f}%)")
# plt.title('first principal plan colored by batch')
# plt.show()
# plt.figure(figsize=(8,8))
# plt.scatter(components.T[0],components.T[1],alpha=0.3,color = y.map(colors_lab))
# plt.xlabel(f"PC 1 ({(pca.explained_variance_ratio_[0] * 100):.1f}%)")
# plt.ylabel(f"PC 2 ({(pca.explained_variance_ratio_[1] * 100):.1f}%)")
# plt.title('first principal plan colored by label')
# plt.show()

8.4.4.3. Individuals on 1-4 components#

# plt.figure(figsize=(10,10))
# plt.subplot(221)
# plt.scatter(components.T[0],components.T[1],alpha=0.3,color = b.map(colors_batch))
# plt.xlabel(f"PC 1 ({(pca.explained_variance_ratio_[0] * 100):.1f}%)")
# plt.ylabel(f"PC 2 ({(pca.explained_variance_ratio_[1] * 100):.1f}%)")
# plt.title('first principal plane colored by batch')

# plt.subplot(222)
# plt.scatter(components.T[2],components.T[3],alpha=0.3,color = b.map(colors_batch))
# plt.xlabel(f"PC 3 ({(pca.explained_variance_ratio_[2] * 100):.1f}%)")
# plt.ylabel(f"PC 4 ({(pca.explained_variance_ratio_[3] * 100):.1f}%)")
# plt.title('second principal plane colored by batch')

# plt.subplot(223)
# plt.scatter(components.T[0],components.T[1],alpha=0.3,color = y.map(colors_lab))
# plt.xlabel(f"PC 1 ({(pca.explained_variance_ratio_[0] * 100):.1f}%)")
# plt.ylabel(f"PC 2 ({(pca.explained_variance_ratio_[1] * 100):.1f}%)")
# plt.title('first principal plane colored by label')

# plt.subplot(224)
# plt.scatter(components.T[2],components.T[3],alpha=0.3,color = y.map(colors_lab))
# plt.xlabel(f"PC 3 ({(pca.explained_variance_ratio_[2] * 100):.1f}%)")
# plt.ylabel(f"PC 4 ({(pca.explained_variance_ratio_[3] * 100):.1f}%)")
# plt.title('second principal plane colored by label')

# plt.show()

8.4.4.4. Density maps#

import warnings
warnings.filterwarnings('ignore')

df = PCDf[['PC 1','PC 2']]
df['batch'] = np.array(b)
df['RX'] = np.array(y)
CP_CI02 = df.where(df.batch=='CI02').dropna()
CP_CI04 = df.where(df.batch=='CI04').dropna()
CP_CI06 = df.where(df.batch=='CI06').dropna()
CP_CI09 = df.where(df.batch=='CI09').dropna()
CP_CI21 = df.where(df.batch=='CI21').dropna()
def filt(x,y, bins):
    x = np.array(x)
    y = np.array(y)

    d = np.digitize(x, bins)
    xfilt = []
    yfilt = []
    for i in np.unique(d):
        xi = x[d == i]
        yi = y[d == i]
        if len(xi) <= 2:
            xfilt.extend(list(xi))
            yfilt.extend(list(yi))
        else:
            xfilt.extend([xi[np.argmax(yi)], xi[np.argmin(yi)]])
            yfilt.extend([yi.max(), yi.min()])
    # prepend/append first/last point if necessary
    if x[0] != xfilt[0]:
        xfilt = [x[0]] + xfilt
        yfilt = [y[0]] + yfilt
    if x[-1] != xfilt[-1]:
        xfilt.append(x[-1])
        yfilt.append(y[-1])
    sort = np.argsort(xfilt)
    return np.array(xfilt)[sort], np.array(yfilt)[sort]
sns.set_style("white")
plt.figure(figsize=(20,10))
plt.subplot(231)

x = CP_CI02['PC 1']
y = CP_CI02['PC 2']
bins = np.linspace(x.min(),x.max(),301)
xf, yf = filt(x,y,bins) 
sns.kdeplot(x=xf, y=yf, cmap="Reds", shade=True)
plt.xlim([-15,30])
plt.ylim([-15,60])
plt.title("CI02")

plt.subplot(232)

x = CP_CI04['PC 1']
y = CP_CI04['PC 2']
bins = np.linspace(x.min(),x.max(),301)
xf, yf = filt(x,y,bins) 
sns.kdeplot(x=xf, y=yf, cmap="Blues", shade=True)
plt.xlim([-15,30])
plt.ylim([-15,60])
plt.title("CI04")

plt.subplot(233)

x = CP_CI06['PC 1']
y = CP_CI06['PC 2']
bins = np.linspace(x.min(),x.max(),301)
xf, yf = filt(x,y,bins) 
sns.kdeplot(x=xf, y=yf, cmap="Greens", shade=True)
plt.xlim([-15,30])
plt.ylim([-15,60])
plt.title("CI06")

plt.subplot(234)

x = CP_CI09['PC 1']
y = CP_CI09['PC 2']
bins = np.linspace(x.min(),x.max(),301)
xf, yf = filt(x,y,bins) 
sns.kdeplot(x=xf, y=yf, cmap="Oranges", shade=True)
plt.xlim([-15,30])
plt.ylim([-15,60])
plt.title("CI09")

plt.subplot(235)

x = CP_CI21['PC 1']
y = CP_CI21['PC 2']
bins = np.linspace(x.min(),x.max(),301)
xf, yf = filt(x,y,bins) 
sns.kdeplot(x=xf, y=yf, cmap="Greys", shade=True)
plt.xlim([-15,30])
plt.ylim([-15,60])
plt.title("CI21")

plt.show()
../../_images/acp_add_data_27_0.png
CP_y0 = df.where(df.RX==0).dropna()
CP_y1 = df.where(df.RX==1).dropna()
sns.set_style("white")
plt.figure(figsize=(20,10))
plt.subplot(231)

x = CP_y0['PC 1']
y = CP_y0['PC 2']
bins = np.linspace(x.min(),x.max(),301)
xf, yf = filt(x,y,bins) 
sns.kdeplot(x=xf, y=yf, cmap="Blues", shade=True)
plt.xlim([-15,30])
plt.ylim([-15,60])
plt.title("RX = 0")

plt.subplot(232)

x = CP_y1['PC 1']
y = CP_y1['PC 2']
bins = np.linspace(x.min(),x.max(),301)
xf, yf = filt(x,y,bins) 
sns.kdeplot(x=xf, y=yf, cmap="Reds", shade=True)
plt.xlim([-15,30])
plt.ylim([-15,60])
plt.title("RX = 1")

plt.show()
../../_images/acp_add_data_29_0.png

8.4.4.5. Correlation cirle on components 1-2, 3-4 and 5-6#

n = np.shape(norm_X)[0]
p = np.shape(norm_X)[1]
eigval = (n-1)/n*pca.explained_variance_
sqrt_eigval = np.sqrt(eigval)
corvar = np.zeros((p,p))
for k in range(p):
    corvar[:,k] = pca.components_[k,:] * sqrt_eigval[k]
fig, axes = plt.subplots(figsize=(8,8))
axes.set_xlim(-1,1)
axes.set_ylim(-1,1)

for j in range(p):
    plt.annotate(X.columns[j],(corvar[j,0],corvar[j,1]))

plt.plot([-1,1],[0,0],color='silver',linestyle='-',linewidth=1)
plt.plot([0,0],[-1,1],color='silver',linestyle='-',linewidth=1)
cercle = plt.Circle((0,0),1,color='blue',fill=False)
axes.add_artist(cercle)

plt.show()
../../_images/acp_add_data_32_0.png
fig, axes = plt.subplots(figsize=(8,8))
axes.set_xlim(-1,1)
axes.set_ylim(-1,1)

for j in range(p):
    plt.annotate(X.columns[j],(corvar[j,2],corvar[j,3]))

plt.plot([-1,1],[0,0],color='silver',linestyle='-',linewidth=1)
plt.plot([0,0],[-1,1],color='silver',linestyle='-',linewidth=1)
cercle = plt.Circle((0,0),1,color='blue',fill=False)
axes.add_artist(cercle)

plt.show()
../../_images/acp_add_data_33_0.png
fig, axes = plt.subplots(figsize=(8,8))
axes.set_xlim(-1,1)
axes.set_ylim(-1,1)

for j in range(p):
    plt.annotate(X.columns[j],(corvar[j,4],corvar[j,5]))

plt.plot([-1,1],[0,0],color='silver',linestyle='-',linewidth=1)
plt.plot([0,0],[-1,1],color='silver',linestyle='-',linewidth=1)
cercle = plt.Circle((0,0),1,color='blue',fill=False)
axes.add_artist(cercle)

plt.show()
../../_images/acp_add_data_34_0.png