In [ ]:
# Imports (stdlib, then third-party) and global configuration.
# Fixed: seaborn was imported twice in the original cell.
import warnings
import random as rd

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
import missingno as msno
from pandas.plotting import scatter_matrix
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler

warnings.filterwarnings('ignore')  # silences all warnings (note: also hides SettingWithCopyWarning)
mpl.rcParams['figure.dpi'] = 100   # sharper inline figures

!git clone https://github.com/andrewsihotang/tubesMLone.git
fatal: destination path 'tubesMLone' already exists and is not an empty directory.
In [ ]:
# Load the training set from the cloned repo (Colab absolute path).
data1 = pd.read_csv('/content/tubesMLone/salju/salju_train.csv')
data1.shape  # (rows, columns) — 109095 x 24
Out[ ]:
(109095, 24)
In [ ]:
data1.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109095 entries, 0 to 109094
Data columns (total 24 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   id                        109095 non-null  int64  
 1   Tanggal                   109095 non-null  object 
 2   KodeLokasi                109095 non-null  object 
 3   SuhuMin                   107973 non-null  float64
 4   SuhuMax                   108166 non-null  float64
 5   Hujan                     106664 non-null  float64
 6   Penguapan                 62071 non-null   float64
 7   SinarMatahari             56716 non-null   float64
 8   ArahAnginTerkencang       101351 non-null  object 
 9   KecepatanAnginTerkencang  101399 non-null  float64
 10  ArahAngin9am              101172 non-null  object 
 11  ArahAngin3pm              105898 non-null  object 
 12  KecepatanAngin9am         107742 non-null  float64
 13  KecepatanAngin3pm         106792 non-null  float64
 14  Kelembaban9am             107093 non-null  float64
 15  Kelembaban3pm             105721 non-null  float64
 16  Tekanan9am                97768 non-null   float64
 17  Tekanan3pm                97787 non-null   float64
 18  Awan9am                   67251 non-null   float64
 19  Awan3pm                   64624 non-null   float64
 20  Suhu9am                   107755 non-null  float64
 21  Suhu3pm                   106397 non-null  float64
 22  BersaljuHariIni           106664 non-null  object 
 23  BersaljuBesok             106664 non-null  object 
dtypes: float64(16), int64(1), object(7)
memory usage: 20.0+ MB
In [ ]:
data1.dtypes
Out[ ]:
id                            int64
Tanggal                      object
KodeLokasi                   object
SuhuMin                     float64
SuhuMax                     float64
Hujan                       float64
Penguapan                   float64
SinarMatahari               float64
ArahAnginTerkencang          object
KecepatanAnginTerkencang    float64
ArahAngin9am                 object
ArahAngin3pm                 object
KecepatanAngin9am           float64
KecepatanAngin3pm           float64
Kelembaban9am               float64
Kelembaban3pm               float64
Tekanan9am                  float64
Tekanan3pm                  float64
Awan9am                     float64
Awan3pm                     float64
Suhu9am                     float64
Suhu3pm                     float64
BersaljuHariIni              object
BersaljuBesok                object
dtype: object
In [ ]:
data1.describe()
Out[ ]:
id SuhuMin SuhuMax Hujan Penguapan SinarMatahari KecepatanAnginTerkencang KecepatanAngin9am KecepatanAngin3pm Kelembaban9am Kelembaban3pm Tekanan9am Tekanan3pm Awan9am Awan3pm Suhu9am Suhu3pm
count 109095.000000 107973.000000 108166.000000 106664.000000 62071.000000 56716.000000 101399.000000 107742.000000 106792.000000 107093.000000 105721.000000 97768.000000 97787.000000 67251.000000 64624.000000 107755.000000 106397.000000
mean 54548.000000 12.196183 23.214819 2.385005 5.462440 7.599527 40.032002 14.052115 18.677579 68.895577 51.567626 1017.647080 1015.253117 4.450893 4.516140 16.991391 21.672771
std 31493.158146 6.389419 7.106596 8.588155 4.201638 3.789042 13.617554 8.926092 8.830199 18.995528 20.791573 7.117338 7.047875 2.884566 2.718738 6.477602 6.922833
min 1.000000 -8.500000 -4.800000 0.000000 0.000000 0.000000 7.000000 0.000000 0.000000 0.000000 0.000000 980.500000 977.100000 0.000000 0.000000 -7.200000 -5.400000
25% 27274.500000 7.600000 17.900000 0.000000 2.600000 4.800000 31.000000 7.000000 13.000000 57.000000 37.000000 1012.900000 1010.400000 1.000000 2.000000 12.300000 16.600000
50% 54548.000000 12.000000 22.600000 0.000000 4.800000 8.400000 39.000000 13.000000 19.000000 70.000000 52.000000 1017.600000 1015.200000 5.000000 5.000000 16.700000 21.100000
75% 81821.500000 16.800000 28.200000 0.800000 7.400000 10.600000 48.000000 19.000000 24.000000 83.000000 66.000000 1022.400000 1020.000000 7.000000 7.000000 21.600000 26.400000
max 109095.000000 33.900000 47.300000 371.000000 145.000000 14.300000 135.000000 130.000000 87.000000 100.000000 100.000000 1041.000000 1039.600000 9.000000 9.000000 40.200000 46.700000
In [ ]:
data1.head()
Out[ ]:
id Tanggal KodeLokasi SuhuMin SuhuMax Hujan Penguapan SinarMatahari ArahAnginTerkencang KecepatanAnginTerkencang ArahAngin9am ArahAngin3pm KecepatanAngin9am KecepatanAngin3pm Kelembaban9am Kelembaban3pm Tekanan9am Tekanan3pm Awan9am Awan3pm Suhu9am Suhu3pm BersaljuHariIni BersaljuBesok
0 1 01/06/2014 C4 10.4 15.5 4.8 NaN NaN WSW 24.0 NaN WSW 0.0 13.0 78.0 76.0 1020.1 1018.5 NaN NaN 13.1 15.0 Ya Tidak
1 2 15/07/2014 C10 9.0 17.0 8.0 2.6 7.4 NaN NaN SW WNW 13.0 20.0 80.0 61.0 1015.2 1014.6 7.0 5.0 11.9 15.5 Ya Ya
2 3 16/02/2011 C46 18.2 32.0 0.0 NaN NaN ESE 44.0 SE SE 15.0 26.0 62.0 42.0 NaN NaN NaN NaN 23.8 29.6 Tidak Tidak
3 4 08/08/2012 C36 7.3 24.5 0.0 8.4 10.4 SSW 54.0 N SW 13.0 19.0 25.0 17.0 1019.2 1016.9 1.0 7.0 15.3 23.2 Tidak Tidak
4 5 29/10/2016 C7 5.9 20.3 0.0 3.6 12.6 N 37.0 NNW ESE 22.0 19.0 55.0 48.0 1019.7 1014.7 2.0 6.0 12.4 18.1 Tidak Tidak

PREPROCESSING¶

Missing Value¶

In [ ]:
data1.isnull().sum()
Out[ ]:
id                              0
Tanggal                         0
KodeLokasi                      0
SuhuMin                      1122
SuhuMax                       929
Hujan                        2431
Penguapan                   47024
SinarMatahari               52379
ArahAnginTerkencang          7744
KecepatanAnginTerkencang     7696
ArahAngin9am                 7923
ArahAngin3pm                 3197
KecepatanAngin9am            1353
KecepatanAngin3pm            2303
Kelembaban9am                2002
Kelembaban3pm                3374
Tekanan9am                  11327
Tekanan3pm                  11308
Awan9am                     41844
Awan3pm                     44471
Suhu9am                      1340
Suhu3pm                      2698
BersaljuHariIni              2431
BersaljuBesok                2431
dtype: int64
In [ ]:
# Missing-value counts per column, largest first.
missing_counts = data1.isna().sum().sort_values(ascending=False)
print(missing_counts)
SinarMatahari               52379
Penguapan                   47024
Awan3pm                     44471
Awan9am                     41844
Tekanan9am                  11327
Tekanan3pm                  11308
ArahAngin9am                 7923
ArahAnginTerkencang          7744
KecepatanAnginTerkencang     7696
Kelembaban3pm                3374
ArahAngin3pm                 3197
Suhu3pm                      2698
Hujan                        2431
BersaljuBesok                2431
BersaljuHariIni              2431
KecepatanAngin3pm            2303
Kelembaban9am                2002
KecepatanAngin9am            1353
Suhu9am                      1340
SuhuMin                      1122
SuhuMax                       929
KodeLokasi                      0
Tanggal                         0
id                              0
dtype: int64
In [ ]:
# Drop every row with at least one missing value.
# Assignment instead of inplace=True: same result, clearer data lineage and
# no hidden mutation of a frame displayed by earlier cells.
data1 = data1.dropna()
data1.shape  # rows shrink from 109095 to 42411
Out[ ]:
(42411, 24)
In [ ]:
# Confirm no missing values remain after the dropna above.
# (Alternative considered: drop only on a column subset.)
# data1 = data1.dropna(how='any',subset=['Penguapan','Hujan','SuhuMin','SuhuMax'])
data1.isna().sum()
Out[ ]:
id                          0
Tanggal                     0
KodeLokasi                  0
SuhuMin                     0
SuhuMax                     0
Hujan                       0
Penguapan                   0
SinarMatahari               0
ArahAnginTerkencang         0
KecepatanAnginTerkencang    0
ArahAngin9am                0
ArahAngin3pm                0
KecepatanAngin9am           0
KecepatanAngin3pm           0
Kelembaban9am               0
Kelembaban3pm               0
Tekanan9am                  0
Tekanan3pm                  0
Awan9am                     0
Awan3pm                     0
Suhu9am                     0
Suhu3pm                     0
BersaljuHariIni             0
BersaljuBesok               0
dtype: int64

Konversi kolom kategorikal menjadi numerik agar korelasinya dapat dilihat

In [ ]:
# Label-encode every object (categorical) column so the correlation matrix
# below can include them. Note this also encodes Tanggal (the date) as an
# arbitrary integer code.
cat_mask = data1.dtypes == object
cat_cols = data1.columns[cat_mask].tolist()
data1[cat_cols] = data1[cat_cols].apply(lambda col: LabelEncoder().fit_transform(col))
# Only a cell's last expression is displayed; the intermediate
# data1[cal_col].head() call in the original was dead code and is removed.
data1
Out[ ]:
id Tanggal KodeLokasi SuhuMin SuhuMax Hujan Penguapan SinarMatahari ArahAnginTerkencang KecepatanAnginTerkencang ArahAngin9am ArahAngin3pm KecepatanAngin9am KecepatanAngin3pm Kelembaban9am Kelembaban3pm Tekanan9am Tekanan3pm Awan9am Awan3pm Suhu9am Suhu3pm BersaljuHariIni BersaljuBesok
3 4 849 15 7.3 24.5 0.0 8.4 10.4 11 54.0 3 12 13.0 19.0 25.0 17.0 1019.2 1016.9 1.0 7.0 15.3 23.2 0 0
4 5 3188 23 5.9 20.3 0.0 3.6 12.6 3 37.0 6 2 22.0 19.0 55.0 48.0 1019.7 1014.7 2.0 6.0 12.4 18.1 0 0
5 6 1271 2 14.4 21.8 0.0 3.2 4.4 12 39.0 12 11 19.0 20.0 63.0 52.0 1016.1 1012.5 7.0 7.0 16.7 21.1 0 0
6 7 1380 15 7.7 18.7 0.2 5.6 9.7 14 46.0 7 14 19.0 28.0 69.0 31.0 1011.3 1008.8 1.0 1.0 11.3 18.3 0 0
8 9 1903 24 18.4 35.3 0.0 10.0 12.5 1 33.0 0 2 11.0 13.0 44.0 18.0 1017.9 1013.4 0.0 0.0 23.7 34.9 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
109080 109081 232 18 16.8 34.1 0.0 12.8 10.3 1 85.0 1 1 30.0 37.0 48.0 28.0 1013.4 1009.2 1.0 4.0 25.6 33.0 0 0
109082 109083 2396 4 8.7 19.0 0.0 1.4 9.6 13 24.0 13 9 22.0 11.0 81.0 59.0 1024.6 1022.3 2.0 2.0 10.8 16.5 0 0
109088 109089 1877 6 14.3 26.2 0.0 8.0 12.6 5 50.0 7 4 13.0 33.0 51.0 37.0 1019.2 1015.6 0.0 2.0 21.1 25.5 0 0
109090 109091 3309 17 20.1 23.7 0.0 7.2 8.9 2 43.0 9 2 24.0 26.0 74.0 70.0 1019.3 1017.6 4.0 6.0 22.0 22.1 0 1
109093 109094 1696 1 10.8 29.8 0.0 7.8 11.2 0 48.0 2 9 13.0 26.0 35.0 18.0 1020.0 1015.8 0.0 1.0 21.7 29.2 0 0

42411 rows × 24 columns

membandingkan keterkaitan

In [ ]:
# Correlation heatmap over all (now fully numeric) columns.
corr_matrix = data1.corr()
plt.figure(figsize=(20, 20))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=1);

Seleksi beberapa atribut yang nilai korelasinya besar

In [ ]:
# Keep the target plus the attributes chosen from the heatmap.
baru = ["BersaljuBesok", "SuhuMin", "SuhuMax", "Suhu9am", "Suhu3pm", "Penguapan", "SinarMatahari", "Awan9am", "Awan3pm", "Kelembaban9am", "Kelembaban3pm"]
# .copy() makes newData an independent frame, so the in-place outlier drops
# later cannot raise SettingWithCopyWarning or alias back into data1.
newData = data1[baru].copy()
# (The original's extra newData.head() expression produced no output and was removed.)
newData
Out[ ]:
BersaljuBesok SuhuMin SuhuMax Suhu9am Suhu3pm Penguapan SinarMatahari Awan9am Awan3pm Kelembaban9am Kelembaban3pm
3 0 7.3 24.5 15.3 23.2 8.4 10.4 1.0 7.0 25.0 17.0
4 0 5.9 20.3 12.4 18.1 3.6 12.6 2.0 6.0 55.0 48.0
5 0 14.4 21.8 16.7 21.1 3.2 4.4 7.0 7.0 63.0 52.0
6 0 7.7 18.7 11.3 18.3 5.6 9.7 1.0 1.0 69.0 31.0
8 0 18.4 35.3 23.7 34.9 10.0 12.5 0.0 0.0 44.0 18.0
... ... ... ... ... ... ... ... ... ... ... ...
109080 0 16.8 34.1 25.6 33.0 12.8 10.3 1.0 4.0 48.0 28.0
109082 0 8.7 19.0 10.8 16.5 1.4 9.6 2.0 2.0 81.0 59.0
109088 0 14.3 26.2 21.1 25.5 8.0 12.6 0.0 2.0 51.0 37.0
109090 1 20.1 23.7 22.0 22.1 7.2 8.9 4.0 6.0 74.0 70.0
109093 0 10.8 29.8 21.7 29.2 7.8 11.2 0.0 1.0 35.0 18.0

42411 rows × 11 columns

Outlier¶

In [ ]:
# Boxplots of the four temperature attributes to spot outliers.
# Fixed: the original called plt.figure(figsize=(10, 50)) before plt.subplots(),
# which only created a stray empty figure ("<Figure ... with 0 Axes>" in the
# output); a custom size must be passed as plt.subplots(figsize=...) instead.
# The redundant dpi line was dropped (dpi=100 is already set in the config cell).
f, axes = plt.subplots(1, 4)
sns.boxplot(y=newData["SuhuMin"], ax=axes[0])   # SuhuMin
sns.boxplot(y=newData["SuhuMax"], ax=axes[1])   # SuhuMax
sns.boxplot(y=newData["Suhu9am"], ax=axes[2])   # Suhu9am
sns.boxplot(y=newData["Suhu3pm"], ax=axes[3])   # Suhu3pm
plt.subplots_adjust(wspace=8)
<Figure size 1000x5000 with 0 Axes>
In [ ]:
# Boxplots of the remaining six selected attributes.
# Fixed: same stray-empty-figure bug as the previous cell — plt.figure() before
# plt.subplots() only produced an unused "<Figure ... with 0 Axes>" output.
f, axes = plt.subplots(1, 6)
sns.boxplot(y=newData["Penguapan"], ax=axes[0])      # Penguapan
sns.boxplot(y=newData["SinarMatahari"], ax=axes[1])  # SinarMatahari
sns.boxplot(y=newData["Awan9am"], ax=axes[2])        # Awan9am
sns.boxplot(y=newData["Awan3pm"], ax=axes[3])        # Awan3pm
sns.boxplot(y=newData["Kelembaban9am"], ax=axes[4])  # Kelembaban9am
sns.boxplot(y=newData["Kelembaban3pm"], ax=axes[5])  # Kelembaban3pm
plt.subplots_adjust(wspace=8)
<Figure size 1000x5000 with 0 Axes>

Data tergolong outlier akan di-drop (SuhuMin, SuhuMax, Suhu9am, Suhu3pm, Penguapan, dan Kelembaban9am)

In [ ]:
# Remove outlier rows; thresholds read off the boxplots above.
# A single boolean-mask filter replaces six chained .drop(..., inplace=True)
# calls: equivalent result (no NaNs remain at this point), no
# SettingWithCopyWarning, and one readable step.
newData = newData[
    (newData.SuhuMin >= -5)          # SuhuMin: drop < -5
    & (newData.SuhuMax <= 40)        # SuhuMax: drop > 40
    & (newData.Suhu9am <= 37)        # Suhu9am: drop > 37
    & (newData.Suhu3pm <= 41)        # Suhu3pm: drop > 41
    & (newData.Penguapan <= 13)      # Penguapan: drop > 13
    & (newData.Kelembaban9am >= 20)  # Kelembaban9am: drop < 20
]
newData
Out[ ]:
BersaljuBesok SuhuMin SuhuMax Suhu9am Suhu3pm Penguapan SinarMatahari Awan9am Awan3pm Kelembaban9am Kelembaban3pm
3 0 7.3 24.5 15.3 23.2 8.4 10.4 1.0 7.0 25.0 17.0
4 0 5.9 20.3 12.4 18.1 3.6 12.6 2.0 6.0 55.0 48.0
5 0 14.4 21.8 16.7 21.1 3.2 4.4 7.0 7.0 63.0 52.0
6 0 7.7 18.7 11.3 18.3 5.6 9.7 1.0 1.0 69.0 31.0
8 0 18.4 35.3 23.7 34.9 10.0 12.5 0.0 0.0 44.0 18.0
... ... ... ... ... ... ... ... ... ... ... ...
109080 0 16.8 34.1 25.6 33.0 12.8 10.3 1.0 4.0 48.0 28.0
109082 0 8.7 19.0 10.8 16.5 1.4 9.6 2.0 2.0 81.0 59.0
109088 0 14.3 26.2 21.1 25.5 8.0 12.6 0.0 2.0 51.0 37.0
109090 1 20.1 23.7 22.0 22.1 7.2 8.9 4.0 6.0 74.0 70.0
109093 0 10.8 29.8 21.7 29.2 7.8 11.2 0.0 1.0 35.0 18.0

40421 rows × 11 columns

simpan dan export ke file bernama Nscaled_salju_train.csv

In [ ]:
# Colab-only: write the unscaled selection to CSV and download it locally.
from google.colab import files

newData.to_csv('Nscaled_salju_train.csv')
files.download('Nscaled_salju_train.csv')
In [ ]:
# !pip install --upgrade gupload

# from pydrive.auth import GoogleAuth
# from google.colab import auth

# # Authenticate and create the PyDrive client
# auth.authenticate_user()

# !gupload --to '13mwhSIIJAgHy-zx_E1bomFCH-0TcBImt' Nscaled_salju_train.csv

Menyeragamkan scaling pada tiap atribut

In [ ]:
# Min-max scale every attribute to [0, 1] so they share a common scale.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(newData)
# Reuse newData's own column order instead of re-typing the column list by
# hand (the original duplicated the list, which risks silent drift).
scalled = pd.DataFrame(scaled_values, columns=newData.columns)
scalled
Out[ ]:
BersaljuBesok SuhuMin SuhuMax Suhu9am Suhu3pm Penguapan SinarMatahari Awan9am Awan3pm Kelembaban9am Kelembaban3pm
0 0.0 0.352601 0.540059 0.431267 0.535411 0.646154 0.727273 0.125 0.777778 0.0625 0.161616
1 0.0 0.312139 0.415430 0.353100 0.390935 0.276923 0.881119 0.250 0.666667 0.4375 0.474747
2 0.0 0.557803 0.459941 0.469003 0.475921 0.246154 0.307692 0.875 0.777778 0.5375 0.515152
3 0.0 0.364162 0.367953 0.323450 0.396601 0.430769 0.678322 0.125 0.111111 0.6125 0.303030
4 0.0 0.673410 0.860534 0.657682 0.866856 0.769231 0.874126 0.000 0.000000 0.3000 0.171717
... ... ... ... ... ... ... ... ... ... ... ...
40416 0.0 0.627168 0.824926 0.708895 0.813031 0.984615 0.720280 0.125 0.444444 0.3500 0.272727
40417 0.0 0.393064 0.376855 0.309973 0.345609 0.107692 0.671329 0.250 0.222222 0.7625 0.585859
40418 0.0 0.554913 0.590504 0.587601 0.600567 0.615385 0.881119 0.000 0.222222 0.3875 0.363636
40419 1.0 0.722543 0.516320 0.611860 0.504249 0.553846 0.622378 0.500 0.666667 0.6750 0.696970
40420 0.0 0.453757 0.697329 0.603774 0.705382 0.600000 0.783217 0.000 0.111111 0.1875 0.171717

40421 rows × 11 columns

simpan dan export ke file bernama scaled_salju_train.csv

In [ ]:
# Colab-only: write the scaled selection to CSV and download it locally.
from google.colab import files

scalled.to_csv('scaled_salju_train.csv')
files.download('scaled_salju_train.csv')

cek korelasi kembali

In [ ]:
# Re-check correlations on the scaled data.
scaled_corr = scalled.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(scaled_corr, annot=True, cmap='coolwarm', linewidths=1);

Memilih atribut Awan3pm dan Kelembaban3pm karena memiliki nilai korelasi tertinggi terhadap atribut BersaljuBesok

In [ ]:
# Reload only the two strongest predictors plus the target from the CSV
# written above.
scalled=pd.read_csv("scaled_salju_train.csv",usecols=["Awan3pm","Kelembaban3pm","BersaljuBesok"])
scalled
Out[ ]:
BersaljuBesok Awan3pm Kelembaban3pm
0 0.0 0.777778 0.161616
1 0.0 0.666667 0.474747
2 0.0 0.777778 0.515152
3 0.0 0.111111 0.303030
4 0.0 0.000000 0.171717
... ... ... ...
40416 0.0 0.444444 0.272727
40417 0.0 0.222222 0.585859
40418 0.0 0.222222 0.363636
40419 1.0 0.666667 0.696970
40420 0.0 0.111111 0.171717

40421 rows × 3 columns

In [ ]:
# Colab-only: mount Google Drive (not used by the cells below).
from google.colab import drive 
drive.mount('/content/drive',force_remount=True)
Mounted at /content/drive
In [ ]:
# Return to /content so the relative CSV paths below resolve.
%cd ..
%cd /content/
/
/content
In [ ]:
# NOTE: this repeats the identical read a few cells above; it only re-verifies
# the file is readable from /content after the directory change.
scalled=pd.read_csv("scaled_salju_train.csv",usecols=["Awan3pm","Kelembaban3pm","BersaljuBesok"])
scalled
Out[ ]:
BersaljuBesok Awan3pm Kelembaban3pm
0 0.0 0.777778 0.161616
1 0.0 0.666667 0.474747
2 0.0 0.777778 0.515152
3 0.0 0.111111 0.303030
4 0.0 0.000000 0.171717
... ... ... ...
40416 0.0 0.444444 0.272727
40417 0.0 0.222222 0.585859
40418 0.0 0.222222 0.363636
40419 1.0 0.666667 0.696970
40420 0.0 0.111111 0.171717

40421 rows × 3 columns

In [ ]:
# Same three columns from the unscaled export, for the unscaled clustering run.
Nscalled=pd.read_csv("Nscaled_salju_train.csv",usecols=["Awan3pm","Kelembaban3pm","BersaljuBesok"])
Nscalled
Out[ ]:
BersaljuBesok Awan3pm Kelembaban3pm
0 0 7.0 17.0
1 0 6.0 48.0
2 0 7.0 52.0
3 0 1.0 31.0
4 0 0.0 18.0
... ... ... ...
40416 0 4.0 28.0
40417 0 2.0 59.0
40418 0 2.0 37.0
40419 1 6.0 70.0
40420 0 1.0 18.0

40421 rows × 3 columns

CLUSTERING¶

proses elbow

In [ ]:
epsilon = list(range(5)) # Initialisation of epsilon
for k in range(1,6):
    cluster = pd.read_csv("scaled_salju_train.csv", usecols=["Awan3pm", "Kelembaban3pm"], nrows=20000) # Read data file into 'cluster'
    rows = cluster.shape[0] #contains the total number of rows in cluster data 'rows'
    cols = cluster.shape[1] #contains the total number of columns in cluster data 'cols'
    centroids = cluster.loc[np.random.randint(1,rows+1,k)] # Randomly initialises 'k' no. of centroids
    centroids['new'] = list(range(1,k+1)) # New indices 1 to k are set for the dataframe 'centroids'
    centroids.set_index('new',inplace = True) 
    d = np.random.rand(rows) # Initialization of 'd' which would contain the centroid number closest to data point
    number_of_iterations = 15
    tmp_eps = list(range(number_of_iterations)) # 'tmp_eps' is the sum of squares of distances between points and centroid of a cluster for each iteration
    for i in range(0,number_of_iterations): # loop is for iterations
          for j in range(0,rows):
              d[j] = ((centroids - cluster.loc[j])**2).sum(axis = 1).idxmin()
          cluster['centroid number'] = d #new column 'centroid number' is added to dataframe 'cluster'
          MX = list(range(k)) # Initialisation of 'MX' which will store mean of 'x' values of each cluster
          MY = list(range(k)) # Initialisation of 'MY' which will store mean of 'y' values of each cluster
          for m in range(0,k):
              MX[m] = cluster[cluster['centroid number'] == (m+1)]['Awan3pm'].mean()
              MY[m] = cluster[cluster['centroid number'] == (m+1)]['Kelembaban3pm'].mean()
          centroids.replace(list(centroids['Awan3pm']),MX,inplace = True) # The 'centroids' are replaced with the new values
          centroids.replace(list(centroids['Kelembaban3pm']),MY,inplace = True) # The 'centroids' are replaced with the new values
          z = list(range(k)) # Initialisation of z and centroid of each cluster.
          for p in range(0,k): # loop calculates square of distances between data points and centroid of each cluster.
              z[p] = ((cluster[cluster['centroid number'] == p+1][['Awan3pm','Kelembaban3pm']] - centroids.iloc[p])**2).values.sum()
          tmp_eps[i] = sum(z) 
          epsilon[k-1] = tmp_eps[i] # The cost function after final iteration for each value of 'k' would be stored in epsilon.
    %reset_selective -f centroids # The dataframe 'centroids' is reset.
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-50-7cbf4fdf1520> in <module>()
     12     for i in range(0,number_of_iterations): # loop is for iterations
     13           for j in range(0,rows):
---> 14               d[j] = ((centroids - cluster.loc[j])**2).sum(axis = 1).idxmin()
     15           cluster['centroid number'] = d #new column 'centroid number' is added to dataframe 'cluster'
     16           MX = list(range(k)) # Initialisation of 'MX' which will store mean of 'x' values of each cluster

/usr/local/lib/python3.7/dist-packages/pandas/core/ops/__init__.py in f(self, other, axis, level, fill_value)
    653         if isinstance(other, ABCDataFrame):
    654             # Another DataFrame
--> 655             new_data = self._combine_frame(other, na_op, fill_value)
    656 
    657         elif isinstance(other, ABCSeries):

/usr/local/lib/python3.7/dist-packages/pandas/core/frame.py in _combine_frame(self, other, func, fill_value)
   5868                 return func(left, right)
   5869 
-> 5870         new_data = ops.dispatch_to_series(self, other, _arith_op)
   5871         return new_data
   5872 

/usr/local/lib/python3.7/dist-packages/pandas/core/ops/__init__.py in dispatch_to_series(left, right, func, axis)
    273         #  _frame_arith_method_with_reindex
    274 
--> 275         bm = left._mgr.operate_blockwise(right._mgr, array_op)
    276         return type(left)(bm)
    277 

/usr/local/lib/python3.7/dist-packages/pandas/core/internals/managers.py in operate_blockwise(self, other, array_op)
    365         Apply array_op blockwise with another (aligned) BlockManager.
    366         """
--> 367         return operate_blockwise(self, other, array_op)
    368 
    369     def apply(self: T, f, align_keys=None, **kwargs) -> T:

/usr/local/lib/python3.7/dist-packages/pandas/core/internals/ops.py in operate_blockwise(left, right, array_op)
     36             lvals, rvals = _get_same_shape_values(blk, rblk, left_ea, right_ea)
     37 
---> 38             res_values = array_op(lvals, rvals)
     39             if left_ea and not right_ea and hasattr(res_values, "reshape"):
     40                 res_values = res_values.reshape(1, -1)

/usr/local/lib/python3.7/dist-packages/pandas/core/ops/array_ops.py in arithmetic_op(left, right, op)
    188     else:
    189         with np.errstate(all="ignore"):
--> 190             res_values = na_arithmetic_op(lvalues, rvalues, op)
    191 
    192     return res_values

/usr/local/lib/python3.7/dist-packages/pandas/core/ops/array_ops.py in na_arithmetic_op(left, right, op, is_cmp)
    141 
    142     try:
--> 143         result = expressions.evaluate(op, left, right)
    144     except TypeError:
    145         if is_cmp:

/usr/local/lib/python3.7/dist-packages/pandas/core/computation/expressions.py in evaluate(op, a, b, use_numexpr)
    231         use_numexpr = use_numexpr and _bool_arith_check(op_str, a, b)
    232         if use_numexpr:
--> 233             return _evaluate(op, op_str, a, b)  # type: ignore
    234     return _evaluate_standard(op, op_str, a, b)
    235 

/usr/local/lib/python3.7/dist-packages/pandas/core/computation/expressions.py in _evaluate_numexpr(op, op_str, a, b)
     98     result = None
     99 
--> 100     if _can_use_numexpr(op, op_str, a, b, "evaluate"):
    101         is_reversed = op.__name__.strip("_").startswith("r")
    102         if is_reversed:

/usr/local/lib/python3.7/dist-packages/pandas/core/computation/expressions.py in _can_use_numexpr(op, op_str, a, b, dtype_check)
     74 
     75         # required min elements (otherwise we are adding overhead)
---> 76         if np.prod(a.shape) > _MIN_ELEMENTS:
     77             # check for dtype compatibility
     78             dtypes = set()

<__array_function__ internals> in prod(*args, **kwargs)

/usr/local/lib/python3.7/dist-packages/numpy/core/fromnumeric.py in prod(a, axis, dtype, out, keepdims, initial, where)
   2998     """
   2999     return _wrapreduction(a, np.multiply, 'prod', axis, dtype, out,
-> 3000                           keepdims=keepdims, initial=initial, where=where)
   3001 
   3002 

/usr/local/lib/python3.7/dist-packages/numpy/core/fromnumeric.py in _wrapreduction(obj, ufunc, method, axis, dtype, out, **kwargs)
     85                 return reduction(axis=axis, out=out, **passkwargs)
     86 
---> 87     return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
     88 
     89 

KeyboardInterrupt: 
In [ ]:
print(epsilon) #cek nilai epsilon
[2448.384170676461, 895.2633686650431, 734.3420954055964, 3, 4]
In [ ]:
# Elbow plot: cost (epsilon) against k — look for the bend.
k = list(range(1, 6))                               # candidate values of k
fig, ax = plt.subplots(figsize=(10, 8))             # figure size adjusted
ax.tick_params(axis='both', labelsize=20)           # enlarge both axes' tick labels
ax.plot(k, epsilon, 'go--', linewidth=1.5, markersize=4)
ax.set_xlabel('Nilai dari k', fontsize=20)          # x-axis label
ax.set_ylabel('Nilai dari epsilon', fontsize=20)    # y-axis label
Out[ ]:
Text(0, 0.5, 'Nilai dari epsilon')

Jadi nilai k berada di 2, karena proses terjadinya penekukan berada pada nilai 2

Mengkonversi dataframe ke dalam bentuk array agar lebih mudah diolah

In [ ]:
#untuk data yang scale
Awan3pmScaled=scalled.Awan3pm #Scale Awan3pmScaled
Kelembaban3pmScaled=scalled.Kelembaban3pm #Scale Kelembaban3pmScaled
SaljuBesokScale=scalled.BersaljuBesok #Scale SaljuBesokScale
ScaleNew=[]
i=0
while i < len(scalled):
      DT1 = [Awan3pmScaled[i], Kelembaban3pmScaled[i], SaljuBesokScale[i]]
      ScaleNew.append(DT1)
      i += 1
In [ ]:
#untuk data yang unscale
Awan3pmNScaled=Nscalled.Awan3pm #Unscale Awan3pmNScaled
Kelembaban3pmNScaled=Nscalled.Kelembaban3pm #Unscale Kelembaban3pmNScaled
SaljuBesokNScale=Nscalled.BersaljuBesok #Unscale SaljuBesokNScale
NScaleNew=[]
i=0
while i < len(Nscalled):
      DT2=[Awan3pmNScaled[i],Kelembaban3pmNScaled[i],SaljuBesokNScale[i]]
      NScaleNew.append(DT2)
      i += 1

untuk mencari nilai distance antara dua titik menggunakan fungsi manhattan distance

In [ ]:
def Mdist(centroid, data):
    """Manhattan (L1) distance between two 2-D points.

    Both arguments are indexable as [x, y]; any extra elements are ignored.
    """
    dx = float(centroid[0] - data[0])
    dy = float(centroid[1] - data[1])
    return abs(dx) + abs(dy)

menentukan centroid baru pada setiap iterasi

In [ ]:
def Cent(cluster):
    """Return the [mean-x, mean-y] centroid of a non-empty list of points.

    Each point is indexable as [x, y]; extra elements are ignored.
    Raises ZeroDivisionError on an empty cluster.
    """
    n = len(cluster)
    total_x = sum(point[0] for point in cluster)
    total_y = sum(point[1] for point in cluster)
    return [total_x / n, total_y / n]
In [ ]:
def kmeans(dataset, maxitr):
    """Hand-rolled k-means with k=2 using Manhattan distance.

    dataset : list of [x, y, ...] points (elements past [1] are carried
              along but ignored for clustering)
    maxitr  : maximum number of iterations
    Returns (centroids, cls1, cls2): the two final [x, y] centroids and the
    two clusters as lists of the original points.
    """
    # BUG FIX: the original drew initial centroids with rd.randint(0, 39744),
    # a hard-coded bound that silently excluded the tail of this dataset
    # (40421 rows) and would raise IndexError on smaller inputs. Use the
    # actual length. (Also fixed the 'cnet2' typo throughout.)
    last = len(dataset) - 1
    cent1 = dataset[rd.randint(0, last)]
    cent2 = dataset[rd.randint(0, last)]
    minus = 1  # signed total centroid movement; 0 means converged
    itr = 0
    while (minus != 0) and (itr < maxitr):
        cls1 = []
        cls2 = []
        cent1old = cent1
        cent2old = cent2
        # Assignment step: each point goes to its nearest centroid
        # (ties go to cluster 2, matching the original's `else` branch).
        for j in range(len(dataset)):
            dist1 = Mdist(cent1old, dataset[j])
            dist2 = Mdist(cent2old, dataset[j])
            if dist1 < dist2:
                cls1.append(dataset[j])
            else:
                cls2.append(dataset[j])
        # Update step. NOTE: Cent raises ZeroDivisionError if a cluster
        # empties; with this data and random restarts that has not occurred.
        cent1 = Cent(cls1)
        cent2 = Cent(cls2)
        minus = (cent1[0] - cent1old[0]) + (cent1[1] - cent1old[1]) + (cent2[0] - cent2old[0]) + (cent2[1] - cent2old[1])
        itr += 1
    centroids = [cent1, cent2]
    return centroids, cls1, cls2
In [ ]:
# Cluster both variants (up to 100 iterations each).
SCent, scaledCluster1, scaledCluster2 = kmeans(ScaleNew, 100)
NSCent, unscaledCluster1, unscaledCluster2 = kmeans(NScaleNew, 100)

scaled

In [ ]:
# Split the scaled clusters into per-axis coordinate lists for plotting.
c1AwanScale = [point[0] for point in scaledCluster1]       # cluster 1 x
c1KelembabanScale = [point[1] for point in scaledCluster1] # cluster 1 y
c2AwanScale = [point[0] for point in scaledCluster2]       # cluster 2 x
c2KelembabanScale = [point[1] for point in scaledCluster2] # cluster 2 y

unscaled

In [ ]:
# Split the unscaled clusters into per-axis coordinate lists for plotting.
c1AwanNScale = [point[0] for point in unscaledCluster1]       # cluster 1 x
c1KelembabanNScale = [point[1] for point in unscaledCluster1] # cluster 1 y
c2AwanNScale = [point[0] for point in unscaledCluster2]       # cluster 2 x
c2KelembabanNScale = [point[1] for point in unscaledCluster2] # cluster 2 y

cls1 dan cls2 ke dalam bentuk grafik

visualisasi clustering yang sudah discaling

In [ ]:
# Scatter plot of the two scaled clusters (yellow / red) with their
# centroids in green.
plt.scatter(c1AwanScale, c1KelembabanScale, c='y', edgecolors='black', linewidth=0.20)
plt.scatter(c2AwanScale, c2KelembabanScale, c='r', edgecolors='black', linewidth=0.20)
for cx, cy in SCent:  # the two centroids
    plt.scatter(cx, cy, c='g', edgecolors='magenta', linewidth=0.20)
plt.title('Scaled dataset')
plt.xlabel('Awan3pm')
plt.ylabel('Kelembaban3pm')
plt.show()

visualisasi clustering yang belum discaling

In [ ]:
# Scatter plot of the two unscaled clusters (yellow / red) with their
# centroids in green.
plt.scatter(c1AwanNScale, c1KelembabanNScale, c='y', edgecolors='black', linewidth=0.20)
plt.scatter(c2AwanNScale, c2KelembabanNScale, c='r', edgecolors='black', linewidth=0.20)
for cx, cy in NSCent:  # the two centroids
    plt.scatter(cx, cy, c='g', edgecolors='magenta', linewidth=0.20)
plt.title('Unscaled dataset')
plt.xlabel('Awan3pm')
plt.ylabel('Kelembaban3pm')
plt.show()

mengembalikan array dalam bentuk dataframe

In [ ]:
# Back to DataFrames: unzip each scaled cluster into coordinate tuples.
# (The trailing BersaljuBesok values are unpacked but not used further.)
c1AwanScale, c1KelembabanScale, scaledBersaljubesokc1 = zip(*scaledCluster1)
scaleCluster1 = pd.DataFrame(
    {'Awan3pm': c1AwanScale, 'Kelembaban3pm': c1KelembabanScale},
    columns=['Awan3pm', 'Kelembaban3pm'],
)
c2AwanScale, c2KelembabanScale, scaledBersaljubesokc2 = zip(*scaledCluster2)
scaleCluster2 = pd.DataFrame(
    {'Awan3pm': c2AwanScale, 'Kelembaban3pm': c2KelembabanScale},
    columns=['Awan3pm', 'Kelembaban3pm'],
)
In [ ]:
# Label each cluster: cluster 1 = no snow, cluster 2 = snow.
scaleCluster1['Cluster']='Tidak Bersalju' #cluster1scale
scaleCluster2['Cluster']='Bersalju' #cluster2scale

Dataset yang sudah discaling dan clustering

In [ ]:
# Combine the two labelled scaled clusters and show the size of each.
scaleCluster = pd.concat([scaleCluster1, scaleCluster2], axis=0)
# (The bare `scaleCluster` expression in the original was dead code: only a
# cell's last expression is displayed.)
scaleCluster.groupby("Cluster").size()
In [ ]:
# Back to DataFrames: unzip each unscaled cluster into coordinate tuples.
# (The trailing BersaljuBesok values are unpacked but not used further.)
c1AwanNScale, c1KelembabanNScale, unscaledBersaljubesokc1 = zip(*unscaledCluster1)
NscaleCluster1 = pd.DataFrame(
    {'Awan3pm': c1AwanNScale, 'Kelembaban3pm': c1KelembabanNScale},
    columns=['Awan3pm', 'Kelembaban3pm'],
)
c2AwanNScale, c2KelembabanNScale, unscaledBersaljubesokc2 = zip(*unscaledCluster2)
NscaleCluster2 = pd.DataFrame(
    {'Awan3pm': c2AwanNScale, 'Kelembaban3pm': c2KelembabanNScale},
    columns=['Awan3pm', 'Kelembaban3pm'],
)
In [ ]:
# Label each cluster: cluster 1 = no snow, cluster 2 = snow.
NscaleCluster1['Cluster'] = 'Tidak Bersalju' #cluster1Nscale
NscaleCluster2['Cluster'] = 'Bersalju' #cluster2Nscale

Dataset yang belum discaling tapi sudah diclustering

In [ ]:
# Combine the two labelled unscaled clusters and show the size of each.
NscaleCluster = pd.concat([NscaleCluster1, NscaleCluster2], axis=0)
# (The bare `NscaleCluster` expression in the original was dead code: only a
# cell's last expression is displayed.)
NscaleCluster.groupby("Cluster").size()

export data ke csv

In [ ]:
# Persist both clustering results as CSV.
scaleCluster.to_csv('ClusteringScalled.csv') #save to csv
NscaleCluster.to_csv('ClusteringUnscalled.csv') #save to csv
In [ ]:
files.download('ClusteringScalled.csv') #download csv
In [ ]:
files.download('ClusteringUnscalled.csv') #download csv