# Environment setup for the snow-prediction notebook: data wrangling (pandas/numpy),
# plotting (matplotlib/seaborn), and sklearn preprocessing utilities.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import random as rd
from pandas.plotting import scatter_matrix
import seaborn as sns  # NOTE(review): duplicate of the import above; harmless but redundant
import missingno as msno
import warnings
from matplotlib import style
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
# NOTE(review): this silences ALL warnings, including pandas
# SettingWithCopyWarning raised by the in-place drops later in the notebook.
warnings.filterwarnings('ignore')
mpl.rcParams['figure.dpi'] = 100
# Fetch the dataset repository (IPython shell escape; no-op if already cloned).
!git clone https://github.com/andrewsihotang/tubesMLone.git
fatal: destination path 'tubesMLone' already exists and is not an empty directory.
# Load the training split; shape reported below is (109095 rows, 24 columns).
data1 = pd.read_csv('/content/tubesMLone/salju/salju_train.csv')
data1.shape
(109095, 24)
# Column summary: names, non-null counts, and dtypes.
data1.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 109095 entries, 0 to 109094 Data columns (total 24 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 109095 non-null int64 1 Tanggal 109095 non-null object 2 KodeLokasi 109095 non-null object 3 SuhuMin 107973 non-null float64 4 SuhuMax 108166 non-null float64 5 Hujan 106664 non-null float64 6 Penguapan 62071 non-null float64 7 SinarMatahari 56716 non-null float64 8 ArahAnginTerkencang 101351 non-null object 9 KecepatanAnginTerkencang 101399 non-null float64 10 ArahAngin9am 101172 non-null object 11 ArahAngin3pm 105898 non-null object 12 KecepatanAngin9am 107742 non-null float64 13 KecepatanAngin3pm 106792 non-null float64 14 Kelembaban9am 107093 non-null float64 15 Kelembaban3pm 105721 non-null float64 16 Tekanan9am 97768 non-null float64 17 Tekanan3pm 97787 non-null float64 18 Awan9am 67251 non-null float64 19 Awan3pm 64624 non-null float64 20 Suhu9am 107755 non-null float64 21 Suhu3pm 106397 non-null float64 22 BersaljuHariIni 106664 non-null object 23 BersaljuBesok 106664 non-null object dtypes: float64(16), int64(1), object(7) memory usage: 20.0+ MB
# Dtype per column (object columns will need encoding before correlation).
data1.dtypes
id int64 Tanggal object KodeLokasi object SuhuMin float64 SuhuMax float64 Hujan float64 Penguapan float64 SinarMatahari float64 ArahAnginTerkencang object KecepatanAnginTerkencang float64 ArahAngin9am object ArahAngin3pm object KecepatanAngin9am float64 KecepatanAngin3pm float64 Kelembaban9am float64 Kelembaban3pm float64 Tekanan9am float64 Tekanan3pm float64 Awan9am float64 Awan3pm float64 Suhu9am float64 Suhu3pm float64 BersaljuHariIni object BersaljuBesok object dtype: object
# Descriptive statistics for the numeric columns (count/mean/std/quartiles).
data1.describe()
id | SuhuMin | SuhuMax | Hujan | Penguapan | SinarMatahari | KecepatanAnginTerkencang | KecepatanAngin9am | KecepatanAngin3pm | Kelembaban9am | Kelembaban3pm | Tekanan9am | Tekanan3pm | Awan9am | Awan3pm | Suhu9am | Suhu3pm | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 109095.000000 | 107973.000000 | 108166.000000 | 106664.000000 | 62071.000000 | 56716.000000 | 101399.000000 | 107742.000000 | 106792.000000 | 107093.000000 | 105721.000000 | 97768.000000 | 97787.000000 | 67251.000000 | 64624.000000 | 107755.000000 | 106397.000000 |
mean | 54548.000000 | 12.196183 | 23.214819 | 2.385005 | 5.462440 | 7.599527 | 40.032002 | 14.052115 | 18.677579 | 68.895577 | 51.567626 | 1017.647080 | 1015.253117 | 4.450893 | 4.516140 | 16.991391 | 21.672771 |
std | 31493.158146 | 6.389419 | 7.106596 | 8.588155 | 4.201638 | 3.789042 | 13.617554 | 8.926092 | 8.830199 | 18.995528 | 20.791573 | 7.117338 | 7.047875 | 2.884566 | 2.718738 | 6.477602 | 6.922833 |
min | 1.000000 | -8.500000 | -4.800000 | 0.000000 | 0.000000 | 0.000000 | 7.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 980.500000 | 977.100000 | 0.000000 | 0.000000 | -7.200000 | -5.400000 |
25% | 27274.500000 | 7.600000 | 17.900000 | 0.000000 | 2.600000 | 4.800000 | 31.000000 | 7.000000 | 13.000000 | 57.000000 | 37.000000 | 1012.900000 | 1010.400000 | 1.000000 | 2.000000 | 12.300000 | 16.600000 |
50% | 54548.000000 | 12.000000 | 22.600000 | 0.000000 | 4.800000 | 8.400000 | 39.000000 | 13.000000 | 19.000000 | 70.000000 | 52.000000 | 1017.600000 | 1015.200000 | 5.000000 | 5.000000 | 16.700000 | 21.100000 |
75% | 81821.500000 | 16.800000 | 28.200000 | 0.800000 | 7.400000 | 10.600000 | 48.000000 | 19.000000 | 24.000000 | 83.000000 | 66.000000 | 1022.400000 | 1020.000000 | 7.000000 | 7.000000 | 21.600000 | 26.400000 |
max | 109095.000000 | 33.900000 | 47.300000 | 371.000000 | 145.000000 | 14.300000 | 135.000000 | 130.000000 | 87.000000 | 100.000000 | 100.000000 | 1041.000000 | 1039.600000 | 9.000000 | 9.000000 | 40.200000 | 46.700000 |
# Peek at the first five rows.
data1.head()
id | Tanggal | KodeLokasi | SuhuMin | SuhuMax | Hujan | Penguapan | SinarMatahari | ArahAnginTerkencang | KecepatanAnginTerkencang | ArahAngin9am | ArahAngin3pm | KecepatanAngin9am | KecepatanAngin3pm | Kelembaban9am | Kelembaban3pm | Tekanan9am | Tekanan3pm | Awan9am | Awan3pm | Suhu9am | Suhu3pm | BersaljuHariIni | BersaljuBesok | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 01/06/2014 | C4 | 10.4 | 15.5 | 4.8 | NaN | NaN | WSW | 24.0 | NaN | WSW | 0.0 | 13.0 | 78.0 | 76.0 | 1020.1 | 1018.5 | NaN | NaN | 13.1 | 15.0 | Ya | Tidak |
1 | 2 | 15/07/2014 | C10 | 9.0 | 17.0 | 8.0 | 2.6 | 7.4 | NaN | NaN | SW | WNW | 13.0 | 20.0 | 80.0 | 61.0 | 1015.2 | 1014.6 | 7.0 | 5.0 | 11.9 | 15.5 | Ya | Ya |
2 | 3 | 16/02/2011 | C46 | 18.2 | 32.0 | 0.0 | NaN | NaN | ESE | 44.0 | SE | SE | 15.0 | 26.0 | 62.0 | 42.0 | NaN | NaN | NaN | NaN | 23.8 | 29.6 | Tidak | Tidak |
3 | 4 | 08/08/2012 | C36 | 7.3 | 24.5 | 0.0 | 8.4 | 10.4 | SSW | 54.0 | N | SW | 13.0 | 19.0 | 25.0 | 17.0 | 1019.2 | 1016.9 | 1.0 | 7.0 | 15.3 | 23.2 | Tidak | Tidak |
4 | 5 | 29/10/2016 | C7 | 5.9 | 20.3 | 0.0 | 3.6 | 12.6 | N | 37.0 | NNW | ESE | 22.0 | 19.0 | 55.0 | 48.0 | 1019.7 | 1014.7 | 2.0 | 6.0 | 12.4 | 18.1 | Tidak | Tidak |
# Missing-value count per column, in column order.
data1.isnull().sum()
id 0 Tanggal 0 KodeLokasi 0 SuhuMin 1122 SuhuMax 929 Hujan 2431 Penguapan 47024 SinarMatahari 52379 ArahAnginTerkencang 7744 KecepatanAnginTerkencang 7696 ArahAngin9am 7923 ArahAngin3pm 3197 KecepatanAngin9am 1353 KecepatanAngin3pm 2303 Kelembaban9am 2002 Kelembaban3pm 3374 Tekanan9am 11327 Tekanan3pm 11308 Awan9am 41844 Awan3pm 44471 Suhu9am 1340 Suhu3pm 2698 BersaljuHariIni 2431 BersaljuBesok 2431 dtype: int64
# Missing-value count per column, sorted largest first for readability.
null_counts = data1.isnull().sum()
total = null_counts.sort_values(ascending=False)
print(total)
SinarMatahari 52379 Penguapan 47024 Awan3pm 44471 Awan9am 41844 Tekanan9am 11327 Tekanan3pm 11308 ArahAngin9am 7923 ArahAnginTerkencang 7744 KecepatanAnginTerkencang 7696 Kelembaban3pm 3374 ArahAngin3pm 3197 Suhu3pm 2698 Hujan 2431 BersaljuBesok 2431 BersaljuHariIni 2431 KecepatanAngin3pm 2303 Kelembaban9am 2002 KecepatanAngin9am 1353 Suhu9am 1340 SuhuMin 1122 SuhuMax 929 KodeLokasi 0 Tanggal 0 id 0 dtype: int64
# Drop every row with any missing value (109095 -> 42411 rows — more than
# half the data is discarded; SinarMatahari/Penguapan drive most of it).
data1.dropna(inplace=True)
data1.shape
(42411, 24)
# Confirm no nulls remain after the dropna above.
# data1 = data1.dropna(how='any',subset=['Penguapan','Hujan','SuhuMin','SuhuMax'])
data1.isnull().sum()
id 0 Tanggal 0 KodeLokasi 0 SuhuMin 0 SuhuMax 0 Hujan 0 Penguapan 0 SinarMatahari 0 ArahAnginTerkencang 0 KecepatanAnginTerkencang 0 ArahAngin9am 0 ArahAngin3pm 0 KecepatanAngin9am 0 KecepatanAngin3pm 0 Kelembaban9am 0 Kelembaban3pm 0 Tekanan9am 0 Tekanan3pm 0 Awan9am 0 Awan3pm 0 Suhu9am 0 Suhu3pm 0 BersaljuHariIni 0 BersaljuBesok 0 dtype: int64
Convert kolom kategorikal (cal) menjadi numerical agar korelasinya dapat dilihat
# Label-encode every object-dtype (categorical) column so a correlation
# matrix can be computed over the whole frame.
cal = data1.dtypes==object
cal_col = data1.columns[cal].tolist()
# NOTE(review): this also label-encodes 'Tanggal' (the date string); the
# resulting integers carry no ordinal meaning for correlation — confirm intended.
data1[cal_col] = data1[cal_col].apply(lambda col: LabelEncoder().fit_transform(col))
data1[cal_col].head()
data1
id | Tanggal | KodeLokasi | SuhuMin | SuhuMax | Hujan | Penguapan | SinarMatahari | ArahAnginTerkencang | KecepatanAnginTerkencang | ArahAngin9am | ArahAngin3pm | KecepatanAngin9am | KecepatanAngin3pm | Kelembaban9am | Kelembaban3pm | Tekanan9am | Tekanan3pm | Awan9am | Awan3pm | Suhu9am | Suhu3pm | BersaljuHariIni | BersaljuBesok | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3 | 4 | 849 | 15 | 7.3 | 24.5 | 0.0 | 8.4 | 10.4 | 11 | 54.0 | 3 | 12 | 13.0 | 19.0 | 25.0 | 17.0 | 1019.2 | 1016.9 | 1.0 | 7.0 | 15.3 | 23.2 | 0 | 0 |
4 | 5 | 3188 | 23 | 5.9 | 20.3 | 0.0 | 3.6 | 12.6 | 3 | 37.0 | 6 | 2 | 22.0 | 19.0 | 55.0 | 48.0 | 1019.7 | 1014.7 | 2.0 | 6.0 | 12.4 | 18.1 | 0 | 0 |
5 | 6 | 1271 | 2 | 14.4 | 21.8 | 0.0 | 3.2 | 4.4 | 12 | 39.0 | 12 | 11 | 19.0 | 20.0 | 63.0 | 52.0 | 1016.1 | 1012.5 | 7.0 | 7.0 | 16.7 | 21.1 | 0 | 0 |
6 | 7 | 1380 | 15 | 7.7 | 18.7 | 0.2 | 5.6 | 9.7 | 14 | 46.0 | 7 | 14 | 19.0 | 28.0 | 69.0 | 31.0 | 1011.3 | 1008.8 | 1.0 | 1.0 | 11.3 | 18.3 | 0 | 0 |
8 | 9 | 1903 | 24 | 18.4 | 35.3 | 0.0 | 10.0 | 12.5 | 1 | 33.0 | 0 | 2 | 11.0 | 13.0 | 44.0 | 18.0 | 1017.9 | 1013.4 | 0.0 | 0.0 | 23.7 | 34.9 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
109080 | 109081 | 232 | 18 | 16.8 | 34.1 | 0.0 | 12.8 | 10.3 | 1 | 85.0 | 1 | 1 | 30.0 | 37.0 | 48.0 | 28.0 | 1013.4 | 1009.2 | 1.0 | 4.0 | 25.6 | 33.0 | 0 | 0 |
109082 | 109083 | 2396 | 4 | 8.7 | 19.0 | 0.0 | 1.4 | 9.6 | 13 | 24.0 | 13 | 9 | 22.0 | 11.0 | 81.0 | 59.0 | 1024.6 | 1022.3 | 2.0 | 2.0 | 10.8 | 16.5 | 0 | 0 |
109088 | 109089 | 1877 | 6 | 14.3 | 26.2 | 0.0 | 8.0 | 12.6 | 5 | 50.0 | 7 | 4 | 13.0 | 33.0 | 51.0 | 37.0 | 1019.2 | 1015.6 | 0.0 | 2.0 | 21.1 | 25.5 | 0 | 0 |
109090 | 109091 | 3309 | 17 | 20.1 | 23.7 | 0.0 | 7.2 | 8.9 | 2 | 43.0 | 9 | 2 | 24.0 | 26.0 | 74.0 | 70.0 | 1019.3 | 1017.6 | 4.0 | 6.0 | 22.0 | 22.1 | 0 | 1 |
109093 | 109094 | 1696 | 1 | 10.8 | 29.8 | 0.0 | 7.8 | 11.2 | 0 | 48.0 | 2 | 9 | 13.0 | 26.0 | 35.0 | 18.0 | 1020.0 | 1015.8 | 0.0 | 1.0 | 21.7 | 29.2 | 0 | 0 |
42411 rows × 24 columns
membandingkan keterkaitan
# Correlation heatmap over all (now fully numeric) columns.
plt.figure(figsize=(20,20))
sns.heatmap(data1.corr(),cmap='coolwarm',annot=True,linewidths=1);
Seleksi beberapa atribut yang memiliki nilai korelasi besar
# Keep the target plus the attributes with the strongest correlation to it.
baru = ["BersaljuBesok", "SuhuMin", "SuhuMax", "Suhu9am", "Suhu3pm", "Penguapan", "SinarMatahari", "Awan9am", "Awan3pm", "Kelembaban9am", "Kelembaban3pm"]
# .copy() makes newData an independent frame instead of a view of data1:
# the outlier removal later mutates newData in place, which on a view raises
# SettingWithCopyWarning (silenced by the global warning filter) and may not
# behave as intended.
newData = data1[baru].copy()
newData.head()
newData
BersaljuBesok | SuhuMin | SuhuMax | Suhu9am | Suhu3pm | Penguapan | SinarMatahari | Awan9am | Awan3pm | Kelembaban9am | Kelembaban3pm | |
---|---|---|---|---|---|---|---|---|---|---|---|
3 | 0 | 7.3 | 24.5 | 15.3 | 23.2 | 8.4 | 10.4 | 1.0 | 7.0 | 25.0 | 17.0 |
4 | 0 | 5.9 | 20.3 | 12.4 | 18.1 | 3.6 | 12.6 | 2.0 | 6.0 | 55.0 | 48.0 |
5 | 0 | 14.4 | 21.8 | 16.7 | 21.1 | 3.2 | 4.4 | 7.0 | 7.0 | 63.0 | 52.0 |
6 | 0 | 7.7 | 18.7 | 11.3 | 18.3 | 5.6 | 9.7 | 1.0 | 1.0 | 69.0 | 31.0 |
8 | 0 | 18.4 | 35.3 | 23.7 | 34.9 | 10.0 | 12.5 | 0.0 | 0.0 | 44.0 | 18.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
109080 | 0 | 16.8 | 34.1 | 25.6 | 33.0 | 12.8 | 10.3 | 1.0 | 4.0 | 48.0 | 28.0 |
109082 | 0 | 8.7 | 19.0 | 10.8 | 16.5 | 1.4 | 9.6 | 2.0 | 2.0 | 81.0 | 59.0 |
109088 | 0 | 14.3 | 26.2 | 21.1 | 25.5 | 8.0 | 12.6 | 0.0 | 2.0 | 51.0 | 37.0 |
109090 | 1 | 20.1 | 23.7 | 22.0 | 22.1 | 7.2 | 8.9 | 4.0 | 6.0 | 74.0 | 70.0 |
109093 | 0 | 10.8 | 29.8 | 21.7 | 29.2 | 7.8 | 11.2 | 0.0 | 1.0 | 35.0 | 18.0 |
42411 rows × 11 columns
mpl.rcParams['figure.dpi'] = 100
# Boxplots of the four temperature attributes to spot outliers.
# figsize is passed to plt.subplots directly: the original code called
# plt.figure(figsize=...) first, which only produced a separate, empty
# figure (the "<Figure ... with 0 Axes>" artifact) that the plots never used.
f, axes = plt.subplots(1, 4, figsize=(10, 50))
sns.boxplot(y=newData["SuhuMin"], ax=axes[0])   # SuhuMin
sns.boxplot(y=newData["SuhuMax"], ax=axes[1])   # SuhuMax
sns.boxplot(y=newData["Suhu9am"], ax=axes[2])   # Suhu9am
sns.boxplot(y=newData["Suhu3pm"], ax=axes[3])   # Suhu3pm
plt.subplots_adjust(wspace=8)
<Figure size 1000x5000 with 0 Axes>
mpl.rcParams['figure.dpi'] = 100
# Boxplots of the remaining six attributes to spot outliers.
# As above: figsize goes straight to plt.subplots so no dangling empty
# figure is created.
f, axes = plt.subplots(1, 6, figsize=(10, 50))
sns.boxplot(y=newData["Penguapan"], ax=axes[0])       # Penguapan
sns.boxplot(y=newData["SinarMatahari"], ax=axes[1])   # SinarMatahari
sns.boxplot(y=newData["Awan9am"], ax=axes[2])         # Awan9am
sns.boxplot(y=newData["Awan3pm"], ax=axes[3])         # Awan3pm
sns.boxplot(y=newData["Kelembaban9am"], ax=axes[4])   # Kelembaban9am
sns.boxplot(y=newData["Kelembaban3pm"], ax=axes[5])   # Kelembaban3pm
plt.subplots_adjust(wspace=8)
<Figure size 1000x5000 with 0 Axes>
Data yang tergolong outlier akan di-drop (SuhuMin, SuhuMax, Suhu9am, Suhu3pm, Penguapan, dan Kelembaban9am)
# Remove the rows flagged as outliers in the boxplots. A single boolean
# mask replaces six chained in-place drops: one pass over the frame, and
# no in-place mutation of a possibly-derived frame (which would raise the
# SettingWithCopyWarning hidden by the global warning filter).
# Note: no NaNs remain at this point (dropna above), so keep-conditions
# are exactly the complement of the original drop-conditions.
newData = newData[
    (newData.SuhuMin >= -5)           # drop SuhuMin < -5
    & (newData.SuhuMax <= 40)         # drop SuhuMax > 40
    & (newData.Suhu9am <= 37)         # drop Suhu9am > 37
    & (newData.Suhu3pm <= 41)         # drop Suhu3pm > 41
    & (newData.Penguapan <= 13)       # drop Penguapan > 13
    & (newData.Kelembaban9am >= 20)   # drop Kelembaban9am < 20
]
newData
BersaljuBesok | SuhuMin | SuhuMax | Suhu9am | Suhu3pm | Penguapan | SinarMatahari | Awan9am | Awan3pm | Kelembaban9am | Kelembaban3pm | |
---|---|---|---|---|---|---|---|---|---|---|---|
3 | 0 | 7.3 | 24.5 | 15.3 | 23.2 | 8.4 | 10.4 | 1.0 | 7.0 | 25.0 | 17.0 |
4 | 0 | 5.9 | 20.3 | 12.4 | 18.1 | 3.6 | 12.6 | 2.0 | 6.0 | 55.0 | 48.0 |
5 | 0 | 14.4 | 21.8 | 16.7 | 21.1 | 3.2 | 4.4 | 7.0 | 7.0 | 63.0 | 52.0 |
6 | 0 | 7.7 | 18.7 | 11.3 | 18.3 | 5.6 | 9.7 | 1.0 | 1.0 | 69.0 | 31.0 |
8 | 0 | 18.4 | 35.3 | 23.7 | 34.9 | 10.0 | 12.5 | 0.0 | 0.0 | 44.0 | 18.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
109080 | 0 | 16.8 | 34.1 | 25.6 | 33.0 | 12.8 | 10.3 | 1.0 | 4.0 | 48.0 | 28.0 |
109082 | 0 | 8.7 | 19.0 | 10.8 | 16.5 | 1.4 | 9.6 | 2.0 | 2.0 | 81.0 | 59.0 |
109088 | 0 | 14.3 | 26.2 | 21.1 | 25.5 | 8.0 | 12.6 | 0.0 | 2.0 | 51.0 | 37.0 |
109090 | 1 | 20.1 | 23.7 | 22.0 | 22.1 | 7.2 | 8.9 | 4.0 | 6.0 | 74.0 | 70.0 |
109093 | 0 | 10.8 | 29.8 | 21.7 | 29.2 | 7.8 | 11.2 | 0.0 | 1.0 | 35.0 | 18.0 |
40421 rows × 11 columns
simpan dan export ke file bernama Nscaled_salju_train.csv
# Save the selected (still unscaled) attributes and download from Colab.
from google.colab import files
newData.to_csv('Nscaled_salju_train.csv')
files.download('Nscaled_salju_train.csv')
# Alternative: upload the CSV to a Google Drive folder via gupload.
# !pip install --upgrade gupload
# from pydrive.auth import GoogleAuth
# from google.colab import auth
# # Authenticate and create the PyDrive client
# auth.authenticate_user()
# !gupload --to '13mwhSIIJAgHy-zx_E1bomFCH-0TcBImt' Nscaled_salju_train.csv
Menyeragamkan scaling pada tiap atribut
# Rescale every attribute to the [0, 1] range with min-max scaling.
scalling = MinMaxScaler()
sclr = scalling.fit_transform(newData)
# Take the column labels from the frame itself instead of a second,
# hand-maintained copy of the attribute list (stays in sync automatically).
colNew = list(newData.columns)
scalled = pd.DataFrame(sclr, columns=colNew)
scalled
BersaljuBesok | SuhuMin | SuhuMax | Suhu9am | Suhu3pm | Penguapan | SinarMatahari | Awan9am | Awan3pm | Kelembaban9am | Kelembaban3pm | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.352601 | 0.540059 | 0.431267 | 0.535411 | 0.646154 | 0.727273 | 0.125 | 0.777778 | 0.0625 | 0.161616 |
1 | 0.0 | 0.312139 | 0.415430 | 0.353100 | 0.390935 | 0.276923 | 0.881119 | 0.250 | 0.666667 | 0.4375 | 0.474747 |
2 | 0.0 | 0.557803 | 0.459941 | 0.469003 | 0.475921 | 0.246154 | 0.307692 | 0.875 | 0.777778 | 0.5375 | 0.515152 |
3 | 0.0 | 0.364162 | 0.367953 | 0.323450 | 0.396601 | 0.430769 | 0.678322 | 0.125 | 0.111111 | 0.6125 | 0.303030 |
4 | 0.0 | 0.673410 | 0.860534 | 0.657682 | 0.866856 | 0.769231 | 0.874126 | 0.000 | 0.000000 | 0.3000 | 0.171717 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
40416 | 0.0 | 0.627168 | 0.824926 | 0.708895 | 0.813031 | 0.984615 | 0.720280 | 0.125 | 0.444444 | 0.3500 | 0.272727 |
40417 | 0.0 | 0.393064 | 0.376855 | 0.309973 | 0.345609 | 0.107692 | 0.671329 | 0.250 | 0.222222 | 0.7625 | 0.585859 |
40418 | 0.0 | 0.554913 | 0.590504 | 0.587601 | 0.600567 | 0.615385 | 0.881119 | 0.000 | 0.222222 | 0.3875 | 0.363636 |
40419 | 1.0 | 0.722543 | 0.516320 | 0.611860 | 0.504249 | 0.553846 | 0.622378 | 0.500 | 0.666667 | 0.6750 | 0.696970 |
40420 | 0.0 | 0.453757 | 0.697329 | 0.603774 | 0.705382 | 0.600000 | 0.783217 | 0.000 | 0.111111 | 0.1875 | 0.171717 |
40421 rows × 11 columns
simpan dan export ke file bernama scaled_salju_train.csv
# Save the scaled attributes and download from Colab.
from google.colab import files
scalled.to_csv('scaled_salju_train.csv')
files.download('scaled_salju_train.csv')
cek korelasi kembali
# Re-check correlations on the scaled attribute subset.
plt.figure(figsize=(10,10))
sns.heatmap(scalled.corr(),cmap='coolwarm',annot=True,linewidths=1);
Memilih atribut Awan3pm dan Kelembaban3pm karena memiliki nilai korelasi tertinggi terhadap atribut BersaljuBesok
# Reload only the two features most correlated with the target, plus the target.
scalled=pd.read_csv("scaled_salju_train.csv",usecols=["Awan3pm","Kelembaban3pm","BersaljuBesok"])
scalled
BersaljuBesok | Awan3pm | Kelembaban3pm | |
---|---|---|---|
0 | 0.0 | 0.777778 | 0.161616 |
1 | 0.0 | 0.666667 | 0.474747 |
2 | 0.0 | 0.777778 | 0.515152 |
3 | 0.0 | 0.111111 | 0.303030 |
4 | 0.0 | 0.000000 | 0.171717 |
... | ... | ... | ... |
40416 | 0.0 | 0.444444 | 0.272727 |
40417 | 0.0 | 0.222222 | 0.585859 |
40418 | 0.0 | 0.222222 | 0.363636 |
40419 | 1.0 | 0.666667 | 0.696970 |
40420 | 0.0 | 0.111111 | 0.171717 |
40421 rows × 3 columns
# Mount Google Drive (remount forced so a stale mount is refreshed).
from google.colab import drive
drive.mount('/content/drive',force_remount=True)
Mounted at /content/drive
# Return to /content so the relative CSV paths below resolve (IPython magics).
%cd ..
%cd /content/
/ /content
# Re-read the scaled feature subset after the directory change.
scalled=pd.read_csv("scaled_salju_train.csv",usecols=["Awan3pm","Kelembaban3pm","BersaljuBesok"])
scalled
BersaljuBesok | Awan3pm | Kelembaban3pm | |
---|---|---|---|
0 | 0.0 | 0.777778 | 0.161616 |
1 | 0.0 | 0.666667 | 0.474747 |
2 | 0.0 | 0.777778 | 0.515152 |
3 | 0.0 | 0.111111 | 0.303030 |
4 | 0.0 | 0.000000 | 0.171717 |
... | ... | ... | ... |
40416 | 0.0 | 0.444444 | 0.272727 |
40417 | 0.0 | 0.222222 | 0.585859 |
40418 | 0.0 | 0.222222 | 0.363636 |
40419 | 1.0 | 0.666667 | 0.696970 |
40420 | 0.0 | 0.111111 | 0.171717 |
40421 rows × 3 columns
# Same feature subset from the UNscaled export, for comparison runs.
Nscalled=pd.read_csv("Nscaled_salju_train.csv",usecols=["Awan3pm","Kelembaban3pm","BersaljuBesok"])
Nscalled
BersaljuBesok | Awan3pm | Kelembaban3pm | |
---|---|---|---|
0 | 0 | 7.0 | 17.0 |
1 | 0 | 6.0 | 48.0 |
2 | 0 | 7.0 | 52.0 |
3 | 0 | 1.0 | 31.0 |
4 | 0 | 0.0 | 18.0 |
... | ... | ... | ... |
40416 | 0 | 4.0 | 28.0 |
40417 | 0 | 2.0 | 59.0 |
40418 | 0 | 2.0 | 37.0 |
40419 | 1 | 6.0 | 70.0 |
40420 | 0 | 1.0 | 18.0 |
40421 rows × 3 columns
proses elbow
# Elbow method: run a hand-rolled k-means for k = 1..5 on the first 20000
# scaled rows and record the final within-cluster sum of squares in
# 'epsilon'. (Indentation reconstructed — the notebook export stripped it.)
epsilon = list(range(5)) # Initialisation of epsilon
for k in range(1,6):
    cluster = pd.read_csv("scaled_salju_train.csv", usecols=["Awan3pm", "Kelembaban3pm"], nrows=20000) # Read data file into 'cluster'
    rows = cluster.shape[0] #contains the total number of rows in cluster data 'rows'
    cols = cluster.shape[1] #contains the total number of columns in cluster data 'cols'
    centroids = cluster.loc[np.random.randint(1,rows+1,k)] # Randomly initialises 'k' no. of centroids
    centroids['new'] = list(range(1,k+1)) # New indices 1 to k are set for the dataframe 'centroids'
    centroids.set_index('new',inplace = True)
    d = np.random.rand(rows) # Initialization of 'd' which would contain the centroid number closest to data point
    number_of_iterations = 15
    tmp_eps = list(range(number_of_iterations)) # 'tmp_eps' is the sum of squares of distances between points and centroid of a cluster for each iteration
    for i in range(0,number_of_iterations): # loop is for iterations
        # Assignment step: nearest centroid (by squared Euclidean distance) per row.
        for j in range(0,rows):
            d[j] = ((centroids - cluster.loc[j])**2).sum(axis = 1).idxmin()
        cluster['centroid number'] = d #new column 'centroid number' is added to dataframe 'cluster'
        MX = list(range(k)) # Initialisation of 'MX' which will store mean of 'x' values of each cluster
        MY = list(range(k)) # Initialisation of 'MY' which will store mean of 'y' values of each cluster
        # Update step: move each centroid to the mean of its assigned points.
        for m in range(0,k):
            MX[m] = cluster[cluster['centroid number'] == (m+1)]['Awan3pm'].mean()
            MY[m] = cluster[cluster['centroid number'] == (m+1)]['Kelembaban3pm'].mean()
        centroids.replace(list(centroids['Awan3pm']),MX,inplace = True) # The 'centroids' are replaced with the new values
        centroids.replace(list(centroids['Kelembaban3pm']),MY,inplace = True) # The 'centroids' are replaced with the new values
        z = list(range(k)) # Initialisation of z and centroid of each cluster.
        for p in range(0,k): # loop calculates square of distances between data points and centroid of each cluster.
            z[p] = ((cluster[cluster['centroid number'] == p+1][['Awan3pm','Kelembaban3pm']] - centroids.iloc[p])**2).values.sum()
        tmp_eps[i] = sum(z)
    epsilon[k-1] = tmp_eps[i] # The cost function after final iteration for each value of 'k' would be stored in epsilon.
    %reset_selective -f centroids # The dataframe 'centroids' is reset.
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) <ipython-input-50-7cbf4fdf1520> in <module>() 12 for i in range(0,number_of_iterations): # loop is for iterations 13 for j in range(0,rows): ---> 14 d[j] = ((centroids - cluster.loc[j])**2).sum(axis = 1).idxmin() 15 cluster['centroid number'] = d #new column 'centroid number' is added to dataframe 'cluster' 16 MX = list(range(k)) # Initialisation of 'MX' which will store mean of 'x' values of each cluster /usr/local/lib/python3.7/dist-packages/pandas/core/ops/__init__.py in f(self, other, axis, level, fill_value) 653 if isinstance(other, ABCDataFrame): 654 # Another DataFrame --> 655 new_data = self._combine_frame(other, na_op, fill_value) 656 657 elif isinstance(other, ABCSeries): /usr/local/lib/python3.7/dist-packages/pandas/core/frame.py in _combine_frame(self, other, func, fill_value) 5868 return func(left, right) 5869 -> 5870 new_data = ops.dispatch_to_series(self, other, _arith_op) 5871 return new_data 5872 /usr/local/lib/python3.7/dist-packages/pandas/core/ops/__init__.py in dispatch_to_series(left, right, func, axis) 273 # _frame_arith_method_with_reindex 274 --> 275 bm = left._mgr.operate_blockwise(right._mgr, array_op) 276 return type(left)(bm) 277 /usr/local/lib/python3.7/dist-packages/pandas/core/internals/managers.py in operate_blockwise(self, other, array_op) 365 Apply array_op blockwise with another (aligned) BlockManager. 
366 """ --> 367 return operate_blockwise(self, other, array_op) 368 369 def apply(self: T, f, align_keys=None, **kwargs) -> T: /usr/local/lib/python3.7/dist-packages/pandas/core/internals/ops.py in operate_blockwise(left, right, array_op) 36 lvals, rvals = _get_same_shape_values(blk, rblk, left_ea, right_ea) 37 ---> 38 res_values = array_op(lvals, rvals) 39 if left_ea and not right_ea and hasattr(res_values, "reshape"): 40 res_values = res_values.reshape(1, -1) /usr/local/lib/python3.7/dist-packages/pandas/core/ops/array_ops.py in arithmetic_op(left, right, op) 188 else: 189 with np.errstate(all="ignore"): --> 190 res_values = na_arithmetic_op(lvalues, rvalues, op) 191 192 return res_values /usr/local/lib/python3.7/dist-packages/pandas/core/ops/array_ops.py in na_arithmetic_op(left, right, op, is_cmp) 141 142 try: --> 143 result = expressions.evaluate(op, left, right) 144 except TypeError: 145 if is_cmp: /usr/local/lib/python3.7/dist-packages/pandas/core/computation/expressions.py in evaluate(op, a, b, use_numexpr) 231 use_numexpr = use_numexpr and _bool_arith_check(op_str, a, b) 232 if use_numexpr: --> 233 return _evaluate(op, op_str, a, b) # type: ignore 234 return _evaluate_standard(op, op_str, a, b) 235 /usr/local/lib/python3.7/dist-packages/pandas/core/computation/expressions.py in _evaluate_numexpr(op, op_str, a, b) 98 result = None 99 --> 100 if _can_use_numexpr(op, op_str, a, b, "evaluate"): 101 is_reversed = op.__name__.strip("_").startswith("r") 102 if is_reversed: /usr/local/lib/python3.7/dist-packages/pandas/core/computation/expressions.py in _can_use_numexpr(op, op_str, a, b, dtype_check) 74 75 # required min elements (otherwise we are adding overhead) ---> 76 if np.prod(a.shape) > _MIN_ELEMENTS: 77 # check for dtype compatibility 78 dtypes = set() <__array_function__ internals> in prod(*args, **kwargs) /usr/local/lib/python3.7/dist-packages/numpy/core/fromnumeric.py in prod(a, axis, dtype, out, keepdims, initial, where) 2998 """ 2999 return 
_wrapreduction(a, np.multiply, 'prod', axis, dtype, out, -> 3000 keepdims=keepdims, initial=initial, where=where) 3001 3002 /usr/local/lib/python3.7/dist-packages/numpy/core/fromnumeric.py in _wrapreduction(obj, ufunc, method, axis, dtype, out, **kwargs) 85 return reduction(axis=axis, out=out, **passkwargs) 86 ---> 87 return ufunc.reduce(obj, axis, dtype, out, **passkwargs) 88 89 KeyboardInterrupt:
# Inspect the elbow costs (the run above was interrupted, so the entries
# for k = 4 and 5 still hold their initialisation values 3 and 4).
print(epsilon) #cek nilai epsilon
[2448.384170676461, 895.2633686650431, 734.3420954055964, 3, 4]
# Plot cost vs. k to find the elbow.
k=list(range(1,6)) # the range of 'k'.
plt.figure(figsize=(10,8)) # figure size.
plt.xticks(fontsize=20) # x-axis tick label size.
plt.yticks(fontsize=20) # y-axis tick label size.
plt.plot(k,epsilon,'go--', linewidth=1.5, markersize=4) # Graph is plotted.
plt.xlabel('Nilai dari k',fontsize = 20) # x-axis label.
plt.ylabel('Nilai dari epsilon',fontsize = 20) # y-axis label.
Text(0, 0.5, 'Nilai dari epsilon')
Jadi nilai k berada di 2, karena proses terjadinya penekukan berada pada nilai 2
Mengkonversi dataframe ke dalam bentuk array agar lebih mudah diolah
# For the scaled data: pull out the three columns as Series.
Awan3pmScaled = scalled.Awan3pm           # scaled Awan3pm
Kelembaban3pmScaled = scalled.Kelembaban3pm  # scaled Kelembaban3pm
SaljuBesokScale = scalled.BersaljuBesok   # scaled target
# Build [awan, kelembaban, label] rows; zip over the Series replaces the
# original index-driven while loop (same order, same values).
ScaleNew = [
    [a, b, s]
    for a, b, s in zip(Awan3pmScaled, Kelembaban3pmScaled, SaljuBesokScale)
]
# For the unscaled data: pull out the three columns as Series.
Awan3pmNScaled = Nscalled.Awan3pm            # unscaled Awan3pm
Kelembaban3pmNScaled = Nscalled.Kelembaban3pm   # unscaled Kelembaban3pm
SaljuBesokNScale = Nscalled.BersaljuBesok    # unscaled target
# Build [awan, kelembaban, label] rows; zip over the Series replaces the
# original index-driven while loop (same order, same values).
NScaleNew = [
    [a, b, s]
    for a, b, s in zip(Awan3pmNScaled, Kelembaban3pmNScaled, SaljuBesokNScale)
]
untuk mencari nilai distance antara dua titik menggunakan fungsi manhattan distance
def Mdist(centroid, data):
    """Manhattan (L1) distance between two points.

    Only the first two coordinates of each sequence are used; any trailing
    elements (e.g. the class label stored at index 2) are ignored.
    """
    dx = abs(float(centroid[0] - data[0]))
    dy = abs(float(centroid[1] - data[1]))
    return dx + dy
menentukan centroid baru pada setiap iterasi
def Cent(cluster):
    """Return the [mean_x, mean_y] centroid of a non-empty list of points.

    Only the first two coordinates of each point contribute; raises
    ZeroDivisionError on an empty cluster (as the original did).
    """
    n = len(cluster)
    total_x = sum(point[0] for point in cluster)
    total_y = sum(point[1] for point in cluster)
    return [total_x / n, total_y / n]
def kmeans(dataset, maxitr):
    """2-means clustering with Manhattan distance.

    Parameters:
        dataset: list of points; each point's first two entries are the
            coordinates used for clustering (extra entries are ignored).
        maxitr: maximum number of assignment/update iterations.

    Returns:
        ([centroid1, centroid2], cluster1, cluster2) where each cluster is
        the list of points assigned to its centroid on the final iteration.
    """
    # Pick the two initial centroids from the data itself. The original
    # hard-coded rd.randint(0, 39744), which silently excludes the tail of
    # this dataset (40421 rows) and crashes on any smaller one.
    cent1 = dataset[rd.randint(0, len(dataset) - 1)]
    cent2 = dataset[rd.randint(0, len(dataset) - 1)]
    shift = 1
    itr = 0
    while (shift != 0) and (itr < maxitr):
        cls1 = []
        cls2 = []
        cent1old = cent1
        cent2old = cent2
        # Assignment step: each point goes to its nearer centroid
        # (ties go to cluster 2, matching the original's else branch).
        for point in dataset:
            if Mdist(cent1old, point) < Mdist(cent2old, point):
                cls1.append(point)
            else:
                cls2.append(point)
        # Update step: recompute each centroid as its cluster mean.
        cent1 = Cent(cls1)
        cent2 = Cent(cls2)
        # Convergence: total ABSOLUTE centroid movement. The original summed
        # signed deltas, which can cancel to zero while the centroids are
        # still moving, stopping the loop prematurely.
        shift = (abs(cent1[0] - cent1old[0]) + abs(cent1[1] - cent1old[1])
                 + abs(cent2[0] - cent2old[0]) + abs(cent2[1] - cent2old[1]))
        itr += 1
    centroids = [cent1, cent2]
    return centroids, cls1, cls2
# Cluster both variants (max 100 iterations each) to compare the effect of scaling.
SCent, scaledCluster1, scaledCluster2 = kmeans(ScaleNew, 100)
NSCent, unscaledCluster1, unscaledCluster2 = kmeans(NScaleNew, 100)
scaled
# Split the scaled cluster points into x (Awan3pm) and y (Kelembaban3pm)
# lists for plotting; comprehensions replace the original index loops.
c1AwanScale = [pt[0] for pt in scaledCluster1]        # cluster 1, scaled
c1KelembabanScale = [pt[1] for pt in scaledCluster1]  # cluster 1, scaled
c2AwanScale = [pt[0] for pt in scaledCluster2]        # cluster 2, scaled
c2KelembabanScale = [pt[1] for pt in scaledCluster2]  # cluster 2, scaled
unscaled
# Split the unscaled cluster points into x (Awan3pm) and y (Kelembaban3pm)
# lists for plotting; comprehensions replace the original index loops.
c1AwanNScale = [pt[0] for pt in unscaledCluster1]        # cluster 1, unscaled
c1KelembabanNScale = [pt[1] for pt in unscaledCluster1]  # cluster 1, unscaled
c2AwanNScale = [pt[0] for pt in unscaledCluster2]        # cluster 2, unscaled
c2KelembabanNScale = [pt[1] for pt in unscaledCluster2]  # cluster 2, unscaled
cls1 dan cls2 ke dalam bentuk grafik
visualisasi clustering yang sudah discaling
# Scatter plot of the two clusters (yellow/red) and their centroids (green)
# for the SCALED dataset.
plt.scatter(c1AwanScale,c1KelembabanScale,c='y',edgecolors='black',linewidth=0.20) #cluster 1 points
plt.scatter(c2AwanScale,c2KelembabanScale,c='r',edgecolors='black',linewidth=0.20) #cluster 2 points
plt.scatter(SCent[0][0],SCent[0][1],c ='g',edgecolors='magenta',linewidth=0.20) #centroid 1
plt.scatter(SCent[1][0],SCent[1][1],c ='g',edgecolors='magenta',linewidth=0.20) #centroid 2
plt.title('Scaled dataset') #title
plt.xlabel('Awan3pm') #xlabel
plt.ylabel('Kelembaban3pm') #ylabel
plt.show()
visualisasi clustering yang belum discaling
# Same scatter plot for the UNSCALED dataset.
plt.scatter(c1AwanNScale,c1KelembabanNScale,c='y',edgecolors='black',linewidth=0.20) #cluster 1 points
plt.scatter(c2AwanNScale,c2KelembabanNScale,c='r',edgecolors='black',linewidth=0.20) #cluster 2 points
plt.scatter(NSCent[0][0],NSCent[0][1],c ='g',edgecolors='magenta',linewidth=0.20) #centroid 1
plt.scatter(NSCent[1][0],NSCent[1][1],c ='g',edgecolors='magenta',linewidth=0.20) #centroid 2
plt.title('Unscaled dataset') #title
plt.xlabel('Awan3pm') #xlabel
plt.ylabel('Kelembaban3pm') #ylabel
plt.show()
mengembalikan array dalam bentuk dataframe
# Rebuild labelled DataFrames from the scaled cluster lists and report the
# size of each cluster. zip(*cluster) unzips the [x, y, label] rows.
c1AwanScale,c1KelembabanScale,scaledBersaljubesokc1=zip(*scaledCluster1) #cluster 1, scaled
scaleCluster1=pd.DataFrame({'Awan3pm':c1AwanScale,'Kelembaban3pm':c1KelembabanScale},columns=['Awan3pm','Kelembaban3pm'])#cluster 1, scaled
c2AwanScale,c2KelembabanScale,scaledBersaljubesokc2=zip(*scaledCluster2)#cluster 2, scaled
scaleCluster2=pd.DataFrame({'Awan3pm':c2AwanScale,'Kelembaban3pm':c2KelembabanScale},columns=['Awan3pm','Kelembaban3pm']) #cluster 2, scaled
# NOTE(review): the cluster/label mapping below is asserted, not verified
# against BersaljuBesok — confirm which cluster actually corresponds to snow.
scaleCluster1['Cluster']='Tidak Bersalju' #cluster 1, scaled
scaleCluster2['Cluster']='Bersalju' #cluster 2, scaled
# Same reconstruction for the unscaled clusters, then cluster sizes.
c1AwanNScale,c1KelembabanNScale,unscaledBersaljubesokc1=zip(*unscaledCluster1) #cluster 1, unscaled
NscaleCluster1=pd.DataFrame({'Awan3pm':c1AwanNScale,'Kelembaban3pm':c1KelembabanNScale},columns=['Awan3pm','Kelembaban3pm']) #cluster 1, unscaled
c2AwanNScale,c2KelembabanNScale,unscaledBersaljubesokc2=zip(*unscaledCluster2) #cluster 2, unscaled
NscaleCluster2=pd.DataFrame({'Awan3pm':c2AwanNScale,'Kelembaban3pm':c2KelembabanNScale},columns=['Awan3pm','Kelembaban3pm']) #cluster 2, unscaled
# NOTE(review): as above, the label assignment per cluster is unverified.
NscaleCluster1['Cluster'] = 'Tidak Bersalju' #cluster 1, unscaled
NscaleCluster2['Cluster'] = 'Bersalju' #cluster 2, unscaled
export data ke csv
# Export both clustering results and download them from Colab.
scaleCluster.to_csv('ClusteringScalled.csv') #save to csv
NscaleCluster.to_csv('ClusteringUnscalled.csv') #save to csv
files.download('ClusteringScalled.csv') #download csv
files.download('ClusteringUnscalled.csv') #download csv