logo_ige
logo_uga

Machine Learning to predict location of ice recrystallization



May - July 2022
UGA and IGE internship
M1 Statistics and Data Sciences (SSD)

Renan MANCEAUX
Supervisor : Thomas CHAUVE

Data exploration


8.7. Tresholding data for class balance#

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys
sys.path.append("../../scripts/")
import utils

8.7.1. Loading data#

CI02 = utils.load_data("../../data/for_learning_plus/CI02.npy")
CI04 = utils.load_data("../../data/for_learning_plus/CI04.npy")
CI06 = utils.load_data("../../data/for_learning_plus/CI06.npy")
CI09 = utils.load_data("../../data/for_learning_plus/CI09.npy")
CI21 = utils.load_data("../../data/for_learning_plus/CI21.npy")

data = pd.concat((CI02,CI04,CI06,CI09,CI21))
data['Y'] = data['Y'].astype(object)

8.7.2. Description of variables#

plt.figure(figsize=(25,20))
c = 1
for i in data.columns[1:]:
    plt.subplot(4,4,c)
    #plt.boxplot((data.loc[(data.Y==1),i],data.loc[(data.Y==0),i]),vert=False,flierprops=dict(markerfacecolor='k', marker='.',linewidth=0.1),widths=0.5,positions=[1,0])
    plt.violinplot((data.loc[(data.Y==1),i],data.loc[(data.Y==0),i]),vert=False,positions=[1,0])
    plt.xlabel(i)
    plt.ylabel("RX_class")
    c += 1
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Input In [3], in <cell line: 3>()
      4 plt.subplot(4,4,c)
      5 #plt.boxplot((data.loc[(data.Y==1),i],data.loc[(data.Y==0),i]),vert=False,flierprops=dict(markerfacecolor='k', marker='.',linewidth=0.1),widths=0.5,positions=[1,0])
----> 6 plt.violinplot((data.loc[(data.Y==1),i],data.loc[(data.Y==0),i]),vert=False,positions=[1,0])
      7 plt.xlabel(i)
      8 plt.ylabel("RX_class")

File /opt/conda/envs/exp_env/lib/python3.9/site-packages/matplotlib/pyplot.py:2999, in violinplot(dataset, positions, vert, widths, showmeans, showextrema, showmedians, quantiles, points, bw_method, data)
   2994 @_copy_docstring_and_deprecators(Axes.violinplot)
   2995 def violinplot(
   2996         dataset, positions=None, vert=True, widths=0.5,
   2997         showmeans=False, showextrema=True, showmedians=False,
   2998         quantiles=None, points=100, bw_method=None, *, data=None):
-> 2999     return gca().violinplot(
   3000         dataset, positions=positions, vert=vert, widths=widths,
   3001         showmeans=showmeans, showextrema=showextrema,
   3002         showmedians=showmedians, quantiles=quantiles, points=points,
   3003         bw_method=bw_method,
   3004         **({"data": data} if data is not None else {}))

File /opt/conda/envs/exp_env/lib/python3.9/site-packages/matplotlib/__init__.py:1412, in _preprocess_data.<locals>.inner(ax, data, *args, **kwargs)
   1409 @functools.wraps(func)
   1410 def inner(ax, *args, data=None, **kwargs):
   1411     if data is None:
-> 1412         return func(ax, *map(sanitize_sequence, args), **kwargs)
   1414     bound = new_sig.bind(ax, *args, **kwargs)
   1415     auto_label = (bound.arguments.get(label_namer)
   1416                   or bound.kwargs.get(label_namer))

File /opt/conda/envs/exp_env/lib/python3.9/site-packages/matplotlib/axes/_axes.py:7938, in Axes.violinplot(self, dataset, positions, vert, widths, showmeans, showextrema, showmedians, quantiles, points, bw_method)
   7935     kde = mlab.GaussianKDE(X, bw_method)
   7936     return kde.evaluate(coords)
-> 7938 vpstats = cbook.violin_stats(dataset, _kde_method, points=points,
   7939                              quantiles=quantiles)
   7940 return self.violin(vpstats, positions=positions, vert=vert,
   7941                    widths=widths, showmeans=showmeans,
   7942                    showextrema=showextrema, showmedians=showmedians)

File /opt/conda/envs/exp_env/lib/python3.9/site-packages/matplotlib/cbook/__init__.py:1453, in violin_stats(X, method, points, quantiles)
   1451 # Evaluate the kernel density estimate
   1452 coords = np.linspace(min_val, max_val, points)
-> 1453 stats['vals'] = method(x, coords)
   1454 stats['coords'] = coords
   1456 # Store additional statistics for this distribution

File /opt/conda/envs/exp_env/lib/python3.9/site-packages/matplotlib/axes/_axes.py:7936, in Axes.violinplot.<locals>._kde_method(X, coords)
   7934     return (X[0] == coords).astype(float)
   7935 kde = mlab.GaussianKDE(X, bw_method)
-> 7936 return kde.evaluate(coords)

File /opt/conda/envs/exp_env/lib/python3.9/site-packages/matplotlib/mlab.py:980, in GaussianKDE.evaluate(self, points)
    978         tdiff = np.dot(self.inv_cov, diff)
    979         energy = np.sum(diff * tdiff, axis=0) / 2.0
--> 980         result[i] = np.sum(np.exp(-energy), axis=0)
    982 result = result / self.norm_factor
    984 return result

KeyboardInterrupt: 

8.7.3. Threshold on dist2GB#

nb_crist = []
nb_ech = []
nb_tot = np.shape(data)[0]
x = list(range(70,0,-5))

nb_1 = len(data.loc[(data.Y==1),"Y"])
nb_0 = len(data.loc[(data.Y==0),"Y"])

for seuil in x :
    nb_crist.append(np.shape(data[(data["dist2GB"] < seuil ) & (data["Y"]==1)])[0])
    nb_ech.append(np.shape(data[(data["dist2GB"] < seuil)])[0])

pd.DataFrame(np.matrix((x,nb_ech,nb_crist,100*np.array(nb_crist)/np.array(nb_ech),-(1-(np.array(nb_crist)/nb_1))*100,-(1-((np.array(nb_ech)-np.array(nb_crist))/nb_0))*100)).T,columns=["dist2GB <","nb_sample","nb_crist","proportion of 1%)","1 lost (%)","0 lost (%)"])
dist2GB < nb_sample nb_crist proportion of 1%) 1 lost (%) 0 lost (%)
0 70.0 1476608.0 95789.0 6.487097 -0.000000 -0.089287
1 65.0 1474770.0 95789.0 6.495182 -0.000000 -0.222278
2 60.0 1471219.0 95789.0 6.510859 -0.000000 -0.479215
3 55.0 1464681.0 95789.0 6.539922 -0.000000 -0.952279
4 50.0 1454225.0 95789.0 6.586945 -0.000000 -1.708835
5 45.0 1438692.0 95789.0 6.658062 -0.000000 -2.832742
6 40.0 1415287.0 95744.0 6.764988 -0.046978 -4.522981
7 35.0 1379449.0 95592.0 6.929723 -0.205660 -7.105082
8 30.0 1322163.0 95243.0 7.203575 -0.570003 -11.224823
9 25.0 1236385.0 94332.0 7.629662 -1.521051 -17.365470
10 20.0 1118626.0 92653.0 8.282750 -3.273862 -25.764569
11 15.0 951528.0 89335.0 9.388583 -6.737726 -37.615055
12 10.0 719824.0 80231.0 11.145919 -16.241948 -53.721529
13 5.0 390033.0 52431.0 13.442709 -45.264070 -75.572427

8.7.4. Threshold on dist2TJ#

nb_crist = []
nb_ech = []
nb_tot = np.shape(data)[0]
x = list(range(70,0,-5))

nb_1 = len(data.loc[(data.Y==1),"Y"])
nb_0 = len(data.loc[(data.Y==0),"Y"])

for seuil in x :
    nb_crist.append(np.shape(data[(data["dist2TJ"] < seuil ) & (data["Y"]==1)])[0])
    nb_ech.append(np.shape(data[(data["dist2TJ"] < seuil)])[0])

pd.DataFrame(np.matrix((x,nb_ech,nb_crist,100*np.array(nb_crist)/np.array(nb_ech),-(1-(np.array(nb_crist)/nb_1))*100,-(1-((np.array(nb_ech)-np.array(nb_crist))/nb_0))*100)).T,columns=["dist2TJ <","nb_sample","nb_crist","proportion of 1%)","1 lost (%)","0 lost (%)"])
dist2TJ < nb_sample nb_crist proportion of 1%) 1 lost (%) 0 lost (%)
0 70.0 1436138.0 95236.0 6.631396 -0.577311 -2.977527
1 65.0 1415562.0 94981.0 6.709773 -0.843521 -4.447876
2 60.0 1387699.0 94689.0 6.823454 -1.148357 -6.442806
3 55.0 1350170.0 94367.0 6.989268 -1.484513 -9.134961
4 50.0 1300529.0 93704.0 7.205068 -2.176659 -12.678819
5 45.0 1235371.0 92661.0 7.500662 -3.265511 -17.317932
6 40.0 1149923.0 90834.0 7.899138 -5.172828 -23.368424
7 35.0 1043373.0 87755.0 8.410703 -8.387184 -30.855184
8 30.0 910662.0 83277.0 9.144666 -13.062043 -40.133627
9 25.0 755644.0 76645.0 10.143004 -19.985593 -50.870263
10 20.0 575385.0 66957.0 11.636904 -30.099490 -63.212120
11 15.0 383070.0 52139.0 13.610828 -45.568907 -76.055115
12 10.0 195100.0 31878.0 16.339313 -66.720605 -88.189889
13 5.0 55458.0 10869.0 19.598615 -88.653186 -96.773713

8.7.5. Subdatasets#

plt.figure(figsize=(7,5))
plt.plot(data.loc[(data.Y==0),"dist2GB"],data.loc[(data.Y==0),"dist2TJ"],'.',alpha=0.5)
plt.plot(data.loc[(data.Y==1),"dist2GB"],data.loc[(data.Y==1),"dist2TJ"],'.',alpha=0.5)
plt.legend(["FALSE",'TRUE'],title="RX")
plt.xlabel("Distance to GB")
plt.ylabel("Distance to TJ")
Text(0, 0.5, 'Distance to TJ')
../../_images/threshold_12_1.png
sub_data = data[(data["dist2GB"] < 30) & (data["dist2TJ"] < 60)]

nb_ech = np.shape(sub_data)[0]
nb_crist = np.shape(sub_data[sub_data["Y"]==1])[0]
nb_tot = np.shape(data)[0]
nb_tot_crist = np.shape(data[data["Y"]==1])[0]
prop = 100*np.array(nb_crist)/np.array(nb_ech)
print("taille du subdataset :",nb_ech)
print("nombre de pixels Y = 1 :",nb_crist)
print("perte de 0 :",((nb_ech-nb_crist)-(nb_tot-nb_tot_crist))/(nb_tot-nb_tot_crist)*100, "%")
print("perte de 1 :",(nb_crist-nb_tot_crist)/nb_tot_crist*100,"%")
print("proportion de 1 : ",prop,"%")
taille du subdataset : 1274623
nombre de pixels Y = 1 : 94143
perte de 0 : -14.585041239373597 %
perte de 1 : -1.7183601457369844 %
proportion de 1 :  7.385948629516335 %
plt.figure(figsize=(7,5))
plt.plot(data.loc[(data.Y==0),"dist2GB"],data.loc[(data.Y==0),"dist2TJ"],'.',alpha=0.5)
plt.plot(data.loc[(data.Y==1),"dist2GB"],data.loc[(data.Y==1),"dist2TJ"],'.',alpha=0.5)
plt.plot([0,30,30],[60,60,0],"--k")
plt.legend(["FALSE",'TRUE',"threshold"],title="RX")
plt.xlabel("Distance to GB")
plt.ylabel("Distance to TJ")
Text(0, 0.5, 'Distance to TJ')
../../_images/threshold_14_1.png
sub_data2 = data[(2*data["dist2GB"] + data["dist2TJ"] < 60)]

nb_ech = np.shape(sub_data2)[0]
nb_crist = np.shape(sub_data2[sub_data2["Y"]==1])[0]
nb_tot = np.shape(data)[0]
nb_tot_crist = np.shape(data[data["Y"]==1])[0]
prop = 100*np.array(nb_crist)/np.array(nb_ech)
print("taille du subdataset :",nb_ech)
print("nombre de pixels Y = 1 :",nb_crist)
print("perte de 0 :",((nb_ech-nb_crist)-(nb_tot-nb_tot_crist))/(nb_tot-nb_tot_crist)*100, "%")
print("perte de 1 :",(nb_crist-nb_tot_crist)/nb_tot_crist*100,"%")
print("proportion de 1 : ",prop,"%")
taille du subdataset : 934738
nombre de pixels Y = 1 : 88343
perte de 0 : -38.75813735073836 %
perte de 1 : -7.773335142866091 %
proportion de 1 :  9.451097526793605 %
plt.figure(figsize=(7,5))
plt.plot(data.loc[(data.Y==0),"dist2GB"],data.loc[(data.Y==0),"dist2TJ"],'.',alpha=0.5)
plt.plot(data.loc[(data.Y==1),"dist2GB"],data.loc[(data.Y==1),"dist2TJ"],'.',alpha=0.5)
plt.plot([0,30],[60,0],"--k")
plt.legend(["FALSE",'TRUE',"threshold"],title="RX")
plt.xlabel("Distance to GB")
plt.ylabel("Distance to TJ")
Text(0, 0.5, 'Distance to TJ')
../../_images/threshold_16_1.png
plt.figure(figsize=(25,20))
c = 1
for i in sub_data2 .columns[1:]:
    plt.subplot(4,4,c)
    #plt.boxplot((sub_data2.loc[(sub_data2.Y==1),i],sub_data2.loc[(sub_data2.Y==0),i]),vert=False,flierprops=dict(markerfacecolor='k', marker='.',linewidth=0.1),widths=0.5,positions=[1,0])
    plt.violinplot((sub_data2.loc[(sub_data2.Y==1),i],sub_data2.loc[(sub_data2.Y==0),i]),vert=False,positions=[1,0])
    plt.xlabel(i)
    plt.ylabel("RX_class")
    c += 1
../../_images/threshold_17_0.png