logo_ige
logo_uga

Machine Learning to predict location of ice recrystallization



May - July 2022
UGA and IGE internship
M1 Statistics and Data Sciences (SSD)

Renan MANCEAUX
Supervisor : Thomas CHAUVE

Dimensional Reduction


8.5. Principal Components Analysis (PCA) on TJ dataset#

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import sys
sys.path.append("../../scripts/")
import utils

from sklearn.decomposition import PCA
import plotly.express as px

8.5.1. Loading data#

TJ_CI02 = utils.load_tj_data("../../data/TJ/TJ_CI02.npy").dropna()
TJ_CI04 = utils.load_tj_data("../../data/TJ/TJ_CI04.npy").dropna()
TJ_CI06 = utils.load_tj_data("../../data/TJ/TJ_CI06.npy").dropna()
TJ_CI09 = utils.load_tj_data("../../data/TJ/TJ_CI09.npy").dropna()
TJ_CI21 = utils.load_tj_data("../../data/TJ/TJ_CI21.npy").dropna()
TJ_CI02['batch'] = ['CI02'] * np.shape(TJ_CI02)[0]
TJ_CI04['batch'] = ['CI04'] * np.shape(TJ_CI04)[0]
TJ_CI06['batch'] = ['CI06'] * np.shape(TJ_CI06)[0]
TJ_CI09['batch'] = ['CI09'] * np.shape(TJ_CI09)[0]
TJ_CI21['batch'] = ['CI21'] * np.shape(TJ_CI21)[0]

8.5.2. PCA#

8.5.2.1. Variables selection#

data = pd.concat((TJ_CI02,TJ_CI04,TJ_CI06,TJ_CI09,TJ_CI21))
data['RX'] = data['RX'].astype(object)
data['batch'] = data['batch'].astype(object)
X = data.loc[:,((data.columns != 'RX')&(data.columns != 'batch'))] 
y = data['RX']
b = data['batch']

8.5.2.2. Normalization and apply PCA#

norm_X = (X - X.mean())/X.std()
pca = PCA()
res_pca  = pca.fit(norm_X)

8.5.3. Results#

8.5.3.1. Eigen values / Explained variance ratio#

plt.bar(np.linspace(1,19,19),res_pca.explained_variance_ratio_,width=0.5)
plt.show()
../../_images/acp_tj_14_0.png

8.5.3.2. First Principal plan of individuals#

components = pca.fit_transform(norm_X)

PCDf = pd.DataFrame(data = components
             , columns = ['PC 1', 'PC 2', 'PC 3', 'PC 4', 'PC 5','PC 6', 'PC 7', 'PC 8', 'PC 9', 'PC 10', 'PC 11', 'PC 12', 'PC 13', 'PC 14', 'PC 15', 'PC 16', 'PC 17', 'PC 18', 'PC 19'])
fig = px.scatter(PCDf[["PC 1","PC 2"]], x='PC 1', y='PC 2', color=b)

fig.update_layout(
    xaxis_title=f"PC 1 ({(pca.explained_variance_ratio_[0] * 100):.1f}%)",
    yaxis_title=f"PC 2 ({(pca.explained_variance_ratio_[1] * 100):.1f}%)",
    width = 800,
    height = 800
)
fig.show()
fig = px.scatter(PCDf[["PC 1","PC 2"]], x='PC 1', y='PC 2', color=y)

fig.update_layout(
    xaxis_title=f"PC 1 ({(pca.explained_variance_ratio_[0] * 100):.1f}%)",
    yaxis_title=f"PC 2 ({(pca.explained_variance_ratio_[1] * 100):.1f}%)",
    width = 800,
    height = 800
)
fig.show()

8.5.3.3. Individuals on 1-4 components#

labels = {
    str(i): f"PC {i+1} ({var:.1f}%)"
    for i, var in enumerate(pca.explained_variance_ratio_ * 100)
}

fig = px.scatter_matrix(
    components,
    labels=labels,
    dimensions=range(4),
    color=b
)
fig.update_traces(diagonal_visible=False)

fig.update_layout(
    width = 700,
    height = 700
)
fig.show()

8.5.3.4. Density maps#

def filt(x,y, bins):
    x = np.array(x)
    y = np.array(y)

    d = np.digitize(x, bins)
    xfilt = []
    yfilt = []
    for i in np.unique(d):
        xi = x[d == i]
        yi = y[d == i]
        if len(xi) <= 2:
            xfilt.extend(list(xi))
            yfilt.extend(list(yi))
        else:
            xfilt.extend([xi[np.argmax(yi)], xi[np.argmin(yi)]])
            yfilt.extend([yi.max(), yi.min()])
    # prepend/append first/last point if necessary
    if x[0] != xfilt[0]:
        xfilt = [x[0]] + xfilt
        yfilt = [y[0]] + yfilt
    if x[-1] != xfilt[-1]:
        xfilt.append(x[-1])
        yfilt.append(y[-1])
    sort = np.argsort(xfilt)
    return np.array(xfilt)[sort], np.array(yfilt)[sort]
import warnings
warnings.filterwarnings('ignore')
df = PCDf[['PC 1','PC 2']]
df['batch'] = np.array(b)
df['RX'] = np.array(y)

CP_CI02 = df.where(df.batch=='CI02').dropna()
CP_CI04 = df.where(df.batch=='CI04').dropna()
CP_CI06 = df.where(df.batch=='CI06').dropna()
CP_CI09 = df.where(df.batch=='CI09').dropna()
CP_CI21 = df.where(df.batch=='CI21').dropna()
sns.set_style("white")
plt.figure(figsize=(20,10))
plt.subplot(231)

x = CP_CI02['PC 1']
y = CP_CI02['PC 2']
sns.kdeplot(x=x, y=y, cmap="Reds", shade=True)
plt.xlim([-5,13])
plt.ylim([-5,10])
plt.title("CI02")

plt.subplot(232)

x = CP_CI04['PC 1']
y = CP_CI04['PC 2']
sns.kdeplot(x=x, y=y, cmap="Blues", shade=True)
plt.xlim([-5,13])
plt.ylim([-5,10])
plt.title("CI04")

plt.subplot(233)

x = CP_CI06['PC 1']
y = CP_CI06['PC 2']
sns.kdeplot(x=x, y=y, cmap="Greens", shade=True)
plt.xlim([-5,13])
plt.ylim([-5,10])
plt.title("CI06")

plt.subplot(234)

x = CP_CI09['PC 1']
y = CP_CI09['PC 2']
sns.kdeplot(x=x, y=y, cmap="Oranges", shade=True)
plt.xlim([-5,13])
plt.ylim([-5,10])
plt.title("CI09")

plt.subplot(235)

x = CP_CI21['PC 1']
y = CP_CI21['PC 2']
sns.kdeplot(x=x, y=y, cmap="Greys", shade=True)
plt.xlim([-5,13])
plt.ylim([-5,10])
plt.title("CI21")

plt.show()
../../_images/acp_tj_25_0.png
CP_y0 = df.where(df.RX==0).dropna()
CP_y1 = df.where(df.RX==1).dropna()
sns.set_style("white")
plt.figure(figsize=(20,10))
plt.subplot(231)

x = CP_y0['PC 1']
y = CP_y0['PC 2']
sns.kdeplot(x=x, y=y, cmap="Blues", shade=True)
plt.xlim([-5,13])
plt.ylim([-5,10])
plt.title("RX = 0")

plt.subplot(232)

x = CP_y1['PC 1']
y = CP_y1['PC 2']
sns.kdeplot(x=x, y=y, cmap="Reds", shade=True)
plt.xlim([-5,13])
plt.ylim([-5,10])
plt.title("RX = 1")

plt.show()
../../_images/acp_tj_27_0.png

8.5.3.5. Correlation cirle on components 1-2 and 3-4#

n = np.shape(norm_X)[0]
p = np.shape(norm_X)[1]
eigval = (n-1)/n*pca.explained_variance_
sqrt_eigval = np.sqrt(eigval)
corvar = np.zeros((p,p))
for k in range(p):
    corvar[:,k] = pca.components_[k,:] * sqrt_eigval[k]
fig, axes = plt.subplots(figsize=(8,8))
axes.set_xlim(-1,1)
axes.set_ylim(-1,1)

for j in range(p):
    plt.annotate(X.columns[j],(corvar[j,0],corvar[j,1]))

plt.plot([-1,1],[0,0],color='silver',linestyle='-',linewidth=1)
plt.plot([0,0],[-1,1],color='silver',linestyle='-',linewidth=1)
cercle = plt.Circle((0,0),1,color='blue',fill=False)
axes.add_artist(cercle)

plt.show()
../../_images/acp_tj_30_0.png
fig, axes = plt.subplots(figsize=(8,8))
axes.set_xlim(-1,1)
axes.set_ylim(-1,1)

for j in range(p):
    plt.annotate(X.columns[j],(corvar[j,2],corvar[j,3]))

plt.plot([-1,1],[0,0],color='silver',linestyle='-',linewidth=1)
plt.plot([0,0],[-1,1],color='silver',linestyle='-',linewidth=1)
cercle = plt.Circle((0,0),1,color='blue',fill=False)
axes.add_artist(cercle)

plt.show()
../../_images/acp_tj_31_0.png