In [ ]:
# Imports (stdlib, then third-party) and global configuration.
# Fixed: seaborn was imported twice in the original cell.
import warnings
import random as rd

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
import missingno as msno
from pandas.plotting import scatter_matrix
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler

warnings.filterwarnings('ignore')  # silences all warnings (note: also hides SettingWithCopyWarning)
mpl.rcParams['figure.dpi'] = 100   # sharper inline figures

!git clone https://github.com/andrewsihotang/tubesMLone.git
fatal: destination path 'tubesMLone' already exists and is not an empty directory.
In [ ]:
# Load the training set from the cloned repo (Colab absolute path).
data1 = pd.read_csv('/content/tubesMLone/salju/salju_train.csv')
data1.shape  # (rows, columns) — 109095 x 24
Out[ ]:
(109095, 24)
In [ ]:
data1.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109095 entries, 0 to 109094
Data columns (total 24 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   id                        109095 non-null  int64  
 1   Tanggal                   109095 non-null  object 
 2   KodeLokasi                109095 non-null  object 
 3   SuhuMin                   107973 non-null  float64
 4   SuhuMax                   108166 non-null  float64
 5   Hujan                     106664 non-null  float64
 6   Penguapan                 62071 non-null   float64
 7   SinarMatahari             56716 non-null   float64
 8   ArahAnginTerkencang       101351 non-null  object 
 9   KecepatanAnginTerkencang  101399 non-null  float64
 10  ArahAngin9am              101172 non-null  object 
 11  ArahAngin3pm              105898 non-null  object 
 12  KecepatanAngin9am         107742 non-null  float64
 13  KecepatanAngin3pm         106792 non-null  float64
 14  Kelembaban9am             107093 non-null  float64
 15  Kelembaban3pm             105721 non-null  float64
 16  Tekanan9am                97768 non-null   float64
 17  Tekanan3pm                97787 non-null   float64
 18  Awan9am                   67251 non-null   float64
 19  Awan3pm                   64624 non-null   float64
 20  Suhu9am                   107755 non-null  float64
 21  Suhu3pm                   106397 non-null  float64
 22  BersaljuHariIni           106664 non-null  object 
 23  BersaljuBesok             106664 non-null  object 
dtypes: float64(16), int64(1), object(7)
memory usage: 20.0+ MB
In [ ]:
data1.dtypes
Out[ ]:
id                            int64
Tanggal                      object
KodeLokasi                   object
SuhuMin                     float64
SuhuMax                     float64
Hujan                       float64
Penguapan                   float64
SinarMatahari               float64
ArahAnginTerkencang          object
KecepatanAnginTerkencang    float64
ArahAngin9am                 object
ArahAngin3pm                 object
KecepatanAngin9am           float64
KecepatanAngin3pm           float64
Kelembaban9am               float64
Kelembaban3pm               float64
Tekanan9am                  float64
Tekanan3pm                  float64
Awan9am                     float64
Awan3pm                     float64
Suhu9am                     float64
Suhu3pm                     float64
BersaljuHariIni              object
BersaljuBesok                object
dtype: object
In [ ]:
data1.describe()
Out[ ]:
id SuhuMin SuhuMax Hujan Penguapan SinarMatahari KecepatanAnginTerkencang KecepatanAngin9am KecepatanAngin3pm Kelembaban9am Kelembaban3pm Tekanan9am Tekanan3pm Awan9am Awan3pm Suhu9am Suhu3pm
count 109095.000000 107973.000000 108166.000000 106664.000000 62071.000000 56716.000000 101399.000000 107742.000000 106792.000000 107093.000000 105721.000000 97768.000000 97787.000000 67251.000000 64624.000000 107755.000000 106397.000000
mean 54548.000000 12.196183 23.214819 2.385005 5.462440 7.599527 40.032002 14.052115 18.677579 68.895577 51.567626 1017.647080 1015.253117 4.450893 4.516140 16.991391 21.672771
std 31493.158146 6.389419 7.106596 8.588155 4.201638 3.789042 13.617554 8.926092 8.830199 18.995528 20.791573 7.117338 7.047875 2.884566 2.718738 6.477602 6.922833
min 1.000000 -8.500000 -4.800000 0.000000 0.000000 0.000000 7.000000 0.000000 0.000000 0.000000 0.000000 980.500000 977.100000 0.000000 0.000000 -7.200000 -5.400000
25% 27274.500000 7.600000 17.900000 0.000000 2.600000 4.800000 31.000000 7.000000 13.000000 57.000000 37.000000 1012.900000 1010.400000 1.000000 2.000000 12.300000 16.600000
50% 54548.000000 12.000000 22.600000 0.000000 4.800000 8.400000 39.000000 13.000000 19.000000 70.000000 52.000000 1017.600000 1015.200000 5.000000 5.000000 16.700000 21.100000
75% 81821.500000 16.800000 28.200000 0.800000 7.400000 10.600000 48.000000 19.000000 24.000000 83.000000 66.000000 1022.400000 1020.000000 7.000000 7.000000 21.600000 26.400000
max 109095.000000 33.900000 47.300000 371.000000 145.000000 14.300000 135.000000 130.000000 87.000000 100.000000 100.000000 1041.000000 1039.600000 9.000000 9.000000 40.200000 46.700000
In [ ]:
data1.head()
Out[ ]:
id Tanggal KodeLokasi SuhuMin SuhuMax Hujan Penguapan SinarMatahari ArahAnginTerkencang KecepatanAnginTerkencang ArahAngin9am ArahAngin3pm KecepatanAngin9am KecepatanAngin3pm Kelembaban9am Kelembaban3pm Tekanan9am Tekanan3pm Awan9am Awan3pm Suhu9am Suhu3pm BersaljuHariIni BersaljuBesok
0 1 01/06/2014 C4 10.4 15.5 4.8 NaN NaN WSW 24.0 NaN WSW 0.0 13.0 78.0 76.0 1020.1 1018.5 NaN NaN 13.1 15.0 Ya Tidak
1 2 15/07/2014 C10 9.0 17.0 8.0 2.6 7.4 NaN NaN SW WNW 13.0 20.0 80.0 61.0 1015.2 1014.6 7.0 5.0 11.9 15.5 Ya Ya
2 3 16/02/2011 C46 18.2 32.0 0.0 NaN NaN ESE 44.0 SE SE 15.0 26.0 62.0 42.0 NaN NaN NaN NaN 23.8 29.6 Tidak Tidak
3 4 08/08/2012 C36 7.3 24.5 0.0 8.4 10.4 SSW 54.0 N SW 13.0 19.0 25.0 17.0 1019.2 1016.9 1.0 7.0 15.3 23.2 Tidak Tidak
4 5 29/10/2016 C7 5.9 20.3 0.0 3.6 12.6 N 37.0 NNW ESE 22.0 19.0 55.0 48.0 1019.7 1014.7 2.0 6.0 12.4 18.1 Tidak Tidak

PREPROCESSING¶

Missing Value¶

In [ ]:
data1.isnull().sum()
Out[ ]:
id                              0
Tanggal                         0
KodeLokasi                      0
SuhuMin                      1122
SuhuMax                       929
Hujan                        2431
Penguapan                   47024
SinarMatahari               52379
ArahAnginTerkencang          7744
KecepatanAnginTerkencang     7696
ArahAngin9am                 7923
ArahAngin3pm                 3197
KecepatanAngin9am            1353
KecepatanAngin3pm            2303
Kelembaban9am                2002
Kelembaban3pm                3374
Tekanan9am                  11327
Tekanan3pm                  11308
Awan9am                     41844
Awan3pm                     44471
Suhu9am                      1340
Suhu3pm                      2698
BersaljuHariIni              2431
BersaljuBesok                2431
dtype: int64
In [ ]:
# Missing-value counts per column, largest first.
missing_counts = data1.isna().sum().sort_values(ascending=False)
print(missing_counts)
SinarMatahari               52379
Penguapan                   47024
Awan3pm                     44471
Awan9am                     41844
Tekanan9am                  11327
Tekanan3pm                  11308
ArahAngin9am                 7923
ArahAnginTerkencang          7744
KecepatanAnginTerkencang     7696
Kelembaban3pm                3374
ArahAngin3pm                 3197
Suhu3pm                      2698
Hujan                        2431
BersaljuBesok                2431
BersaljuHariIni              2431
KecepatanAngin3pm            2303
Kelembaban9am                2002
KecepatanAngin9am            1353
Suhu9am                      1340
SuhuMin                      1122
SuhuMax                       929
KodeLokasi                      0
Tanggal                         0
id                              0
dtype: int64
In [ ]:
# Drop every row with at least one missing value.
# Assignment instead of inplace=True: same result, clearer data lineage and
# no hidden mutation of a frame displayed by earlier cells.
data1 = data1.dropna()
data1.shape  # rows shrink from 109095 to 42411
Out[ ]:
(42411, 24)
In [ ]:
# Confirm no missing values remain after the dropna above.
# (Alternative considered: drop only on a column subset.)
# data1 = data1.dropna(how='any',subset=['Penguapan','Hujan','SuhuMin','SuhuMax'])
data1.isna().sum()
Out[ ]:
id                          0
Tanggal                     0
KodeLokasi                  0
SuhuMin                     0
SuhuMax                     0
Hujan                       0
Penguapan                   0
SinarMatahari               0
ArahAnginTerkencang         0
KecepatanAnginTerkencang    0
ArahAngin9am                0
ArahAngin3pm                0
KecepatanAngin9am           0
KecepatanAngin3pm           0
Kelembaban9am               0
Kelembaban3pm               0
Tekanan9am                  0
Tekanan3pm                  0
Awan9am                     0
Awan3pm                     0
Suhu9am                     0
Suhu3pm                     0
BersaljuHariIni             0
BersaljuBesok               0
dtype: int64

Konversi kolom kategorikal menjadi numerik agar korelasinya dapat dilihat

In [ ]:
# Label-encode every object (categorical) column so the correlation matrix
# below can include them. Note this also encodes Tanggal (the date) as an
# arbitrary integer code.
cat_mask = data1.dtypes == object
cat_cols = data1.columns[cat_mask].tolist()
data1[cat_cols] = data1[cat_cols].apply(lambda col: LabelEncoder().fit_transform(col))
# Only a cell's last expression is displayed; the intermediate
# data1[cal_col].head() call in the original was dead code and is removed.
data1
Out[ ]:
id Tanggal KodeLokasi SuhuMin SuhuMax Hujan Penguapan SinarMatahari ArahAnginTerkencang KecepatanAnginTerkencang ArahAngin9am ArahAngin3pm KecepatanAngin9am KecepatanAngin3pm Kelembaban9am Kelembaban3pm Tekanan9am Tekanan3pm Awan9am Awan3pm Suhu9am Suhu3pm BersaljuHariIni BersaljuBesok
3 4 849 15 7.3 24.5 0.0 8.4 10.4 11 54.0 3 12 13.0 19.0 25.0 17.0 1019.2 1016.9 1.0 7.0 15.3 23.2 0 0
4 5 3188 23 5.9 20.3 0.0 3.6 12.6 3 37.0 6 2 22.0 19.0 55.0 48.0 1019.7 1014.7 2.0 6.0 12.4 18.1 0 0
5 6 1271 2 14.4 21.8 0.0 3.2 4.4 12 39.0 12 11 19.0 20.0 63.0 52.0 1016.1 1012.5 7.0 7.0 16.7 21.1 0 0
6 7 1380 15 7.7 18.7 0.2 5.6 9.7 14 46.0 7 14 19.0 28.0 69.0 31.0 1011.3 1008.8 1.0 1.0 11.3 18.3 0 0
8 9 1903 24 18.4 35.3 0.0 10.0 12.5 1 33.0 0 2 11.0 13.0 44.0 18.0 1017.9 1013.4 0.0 0.0 23.7 34.9 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
109080 109081 232 18 16.8 34.1 0.0 12.8 10.3 1 85.0 1 1 30.0 37.0 48.0 28.0 1013.4 1009.2 1.0 4.0 25.6 33.0 0 0
109082 109083 2396 4 8.7 19.0 0.0 1.4 9.6 13 24.0 13 9 22.0 11.0 81.0 59.0 1024.6 1022.3 2.0 2.0 10.8 16.5 0 0
109088 109089 1877 6 14.3 26.2 0.0 8.0 12.6 5 50.0 7 4 13.0 33.0 51.0 37.0 1019.2 1015.6 0.0 2.0 21.1 25.5 0 0
109090 109091 3309 17 20.1 23.7 0.0 7.2 8.9 2 43.0 9 2 24.0 26.0 74.0 70.0 1019.3 1017.6 4.0 6.0 22.0 22.1 0 1
109093 109094 1696 1 10.8 29.8 0.0 7.8 11.2 0 48.0 2 9 13.0 26.0 35.0 18.0 1020.0 1015.8 0.0 1.0 21.7 29.2 0 0

42411 rows × 24 columns

membandingkan keterkaitan

In [ ]:
# Correlation heatmap over all (now fully numeric) columns.
corr_matrix = data1.corr()
plt.figure(figsize=(20, 20))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=1);

Seleksi beberapa atribut yang nilai korelasinya besar

In [ ]:
# Keep the target plus the attributes chosen from the heatmap.
baru = ["BersaljuBesok", "SuhuMin", "SuhuMax", "Suhu9am", "Suhu3pm", "Penguapan", "SinarMatahari", "Awan9am", "Awan3pm", "Kelembaban9am", "Kelembaban3pm"]
# .copy() makes newData an independent frame, so the in-place outlier drops
# later cannot raise SettingWithCopyWarning or alias back into data1.
newData = data1[baru].copy()
# (The original's extra newData.head() expression produced no output and was removed.)
newData
Out[ ]:
BersaljuBesok SuhuMin SuhuMax Suhu9am Suhu3pm Penguapan SinarMatahari Awan9am Awan3pm Kelembaban9am Kelembaban3pm
3 0 7.3 24.5 15.3 23.2 8.4 10.4 1.0 7.0 25.0 17.0
4 0 5.9 20.3 12.4 18.1 3.6 12.6 2.0 6.0 55.0 48.0
5 0 14.4 21.8 16.7 21.1 3.2 4.4 7.0 7.0 63.0 52.0
6 0 7.7 18.7 11.3 18.3 5.6 9.7 1.0 1.0 69.0 31.0
8 0 18.4 35.3 23.7 34.9 10.0 12.5 0.0 0.0 44.0 18.0
... ... ... ... ... ... ... ... ... ... ... ...
109080 0 16.8 34.1 25.6 33.0 12.8 10.3 1.0 4.0 48.0 28.0
109082 0 8.7 19.0 10.8 16.5 1.4 9.6 2.0 2.0 81.0 59.0
109088 0 14.3 26.2 21.1 25.5 8.0 12.6 0.0 2.0 51.0 37.0
109090 1 20.1 23.7 22.0 22.1 7.2 8.9 4.0 6.0 74.0 70.0
109093 0 10.8 29.8 21.7 29.2 7.8 11.2 0.0 1.0 35.0 18.0

42411 rows × 11 columns

Outlier¶

In [ ]:
# Boxplots of the four temperature attributes to spot outliers.
# Fixed: the original called plt.figure(figsize=(10, 50)) before plt.subplots(),
# which only created a stray empty figure ("<Figure ... with 0 Axes>" in the
# output); a custom size must be passed as plt.subplots(figsize=...) instead.
# The redundant dpi line was dropped (dpi=100 is already set in the config cell).
f, axes = plt.subplots(1, 4)
sns.boxplot(y=newData["SuhuMin"], ax=axes[0])   # SuhuMin
sns.boxplot(y=newData["SuhuMax"], ax=axes[1])   # SuhuMax
sns.boxplot(y=newData["Suhu9am"], ax=axes[2])   # Suhu9am
sns.boxplot(y=newData["Suhu3pm"], ax=axes[3])   # Suhu3pm
plt.subplots_adjust(wspace=8)
<Figure size 1000x5000 with 0 Axes>
In [ ]:
# Boxplots of the remaining six selected attributes.
# Fixed: same stray-empty-figure bug as the previous cell — plt.figure() before
# plt.subplots() only produced an unused "<Figure ... with 0 Axes>" output.
f, axes = plt.subplots(1, 6)
sns.boxplot(y=newData["Penguapan"], ax=axes[0])      # Penguapan
sns.boxplot(y=newData["SinarMatahari"], ax=axes[1])  # SinarMatahari
sns.boxplot(y=newData["Awan9am"], ax=axes[2])        # Awan9am
sns.boxplot(y=newData["Awan3pm"], ax=axes[3])        # Awan3pm
sns.boxplot(y=newData["Kelembaban9am"], ax=axes[4])  # Kelembaban9am
sns.boxplot(y=newData["Kelembaban3pm"], ax=axes[5])  # Kelembaban3pm
plt.subplots_adjust(wspace=8)
<Figure size 1000x5000 with 0 Axes>

Data tergolong outlier akan di-drop (SuhuMin, SuhuMax, Suhu9am, Suhu3pm, Penguapan, dan Kelembaban9am)

In [ ]:
# Remove outlier rows; thresholds read off the boxplots above.
# A single boolean-mask filter replaces six chained .drop(..., inplace=True)
# calls: equivalent result (no NaNs remain at this point), no
# SettingWithCopyWarning, and one readable step.
newData = newData[
    (newData.SuhuMin >= -5)          # SuhuMin: drop < -5
    & (newData.SuhuMax <= 40)        # SuhuMax: drop > 40
    & (newData.Suhu9am <= 37)        # Suhu9am: drop > 37
    & (newData.Suhu3pm <= 41)        # Suhu3pm: drop > 41
    & (newData.Penguapan <= 13)      # Penguapan: drop > 13
    & (newData.Kelembaban9am >= 20)  # Kelembaban9am: drop < 20
]
newData
Out[ ]:
BersaljuBesok SuhuMin SuhuMax Suhu9am Suhu3pm Penguapan SinarMatahari Awan9am Awan3pm Kelembaban9am Kelembaban3pm
3 0 7.3 24.5 15.3 23.2 8.4 10.4 1.0 7.0 25.0 17.0
4 0 5.9 20.3 12.4 18.1 3.6 12.6 2.0 6.0 55.0 48.0
5 0 14.4 21.8 16.7 21.1 3.2 4.4 7.0 7.0 63.0 52.0
6 0 7.7 18.7 11.3 18.3 5.6 9.7 1.0 1.0 69.0 31.0
8 0 18.4 35.3 23.7 34.9 10.0 12.5 0.0 0.0 44.0 18.0
... ... ... ... ... ... ... ... ... ... ... ...
109080 0 16.8 34.1 25.6 33.0 12.8 10.3 1.0 4.0 48.0 28.0
109082 0 8.7 19.0 10.8 16.5 1.4 9.6 2.0 2.0 81.0 59.0
109088 0 14.3 26.2 21.1 25.5 8.0 12.6 0.0 2.0 51.0 37.0
109090 1 20.1 23.7 22.0 22.1 7.2 8.9 4.0 6.0 74.0 70.0
109093 0 10.8 29.8 21.7 29.2 7.8 11.2 0.0 1.0 35.0 18.0

40421 rows × 11 columns

simpan dan export ke file bernama Nscaled_salju_train.csv

In [ ]:
# Colab-only: write the unscaled selection to CSV and download it locally.
from google.colab import files

newData.to_csv('Nscaled_salju_train.csv')
files.download('Nscaled_salju_train.csv')
In [ ]:
# !pip install --upgrade gupload

# from pydrive.auth import GoogleAuth
# from google.colab import auth

# # Authenticate and create the PyDrive client
# auth.authenticate_user()

# !gupload --to '13mwhSIIJAgHy-zx_E1bomFCH-0TcBImt' Nscaled_salju_train.csv

Menyeragamkan scaling pada tiap atribut

In [ ]:
# Min-max scale every attribute to [0, 1] so they share a common scale.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(newData)
# Reuse newData's own column order instead of re-typing the column list by
# hand (the original duplicated the list, which risks silent drift).
scalled = pd.DataFrame(scaled_values, columns=newData.columns)
scalled
Out[ ]:
BersaljuBesok SuhuMin SuhuMax Suhu9am Suhu3pm Penguapan SinarMatahari Awan9am Awan3pm Kelembaban9am Kelembaban3pm
0 0.0 0.352601 0.540059 0.431267 0.535411 0.646154 0.727273 0.125 0.777778 0.0625 0.161616
1 0.0 0.312139 0.415430 0.353100 0.390935 0.276923 0.881119 0.250 0.666667 0.4375 0.474747
2 0.0 0.557803 0.459941 0.469003 0.475921 0.246154 0.307692 0.875 0.777778 0.5375 0.515152
3 0.0 0.364162 0.367953 0.323450 0.396601 0.430769 0.678322 0.125 0.111111 0.6125 0.303030
4 0.0 0.673410 0.860534 0.657682 0.866856 0.769231 0.874126 0.000 0.000000 0.3000 0.171717
... ... ... ... ... ... ... ... ... ... ... ...
40416 0.0 0.627168 0.824926 0.708895 0.813031 0.984615 0.720280 0.125 0.444444 0.3500 0.272727
40417 0.0 0.393064 0.376855 0.309973 0.345609 0.107692 0.671329 0.250 0.222222 0.7625 0.585859
40418 0.0 0.554913 0.590504 0.587601 0.600567 0.615385 0.881119 0.000 0.222222 0.3875 0.363636
40419 1.0 0.722543 0.516320 0.611860 0.504249 0.553846 0.622378 0.500 0.666667 0.6750 0.696970
40420 0.0 0.453757 0.697329 0.603774 0.705382 0.600000 0.783217 0.000 0.111111 0.1875 0.171717

40421 rows × 11 columns

simpan dan export ke file bernama scaled_salju_train.csv

In [ ]:
# Colab-only: write the scaled selection to CSV and download it locally.
from google.colab import files

scalled.to_csv('scaled_salju_train.csv')
files.download('scaled_salju_train.csv')

cek korelasi kembali

In [ ]:
# Re-check correlations on the scaled data.
scaled_corr = scalled.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(scaled_corr, annot=True, cmap='coolwarm', linewidths=1);

Memilih atribut Awan3pm dan Kelembaban3pm karena memiliki nilai korelasi tertinggi terhadap atribut BersaljuBesok

In [ ]:
# Reload only the two strongest predictors plus the target from the CSV
# written above.
scalled=pd.read_csv("scaled_salju_train.csv",usecols=["Awan3pm","Kelembaban3pm","BersaljuBesok"])
scalled
Out[ ]:
BersaljuBesok Awan3pm Kelembaban3pm
0 0.0 0.777778 0.161616
1 0.0 0.666667 0.474747
2 0.0 0.777778 0.515152
3 0.0 0.111111 0.303030
4 0.0 0.000000 0.171717
... ... ... ...
40416 0.0 0.444444 0.272727
40417 0.0 0.222222 0.585859
40418 0.0 0.222222 0.363636
40419 1.0 0.666667 0.696970
40420 0.0 0.111111 0.171717

40421 rows × 3 columns

In [ ]:
# Colab-only: mount Google Drive (not used by the cells below).
from google.colab import drive 
drive.mount('/content/drive',force_remount=True)
Mounted at /content/drive
In [ ]:
# Return to /content so the relative CSV paths below resolve.
%cd ..
%cd /content/
/
/content
In [ ]:
# NOTE: this repeats the identical read a few cells above; it only re-verifies
# the file is readable from /content after the directory change.
scalled=pd.read_csv("scaled_salju_train.csv",usecols=["Awan3pm","Kelembaban3pm","BersaljuBesok"])
scalled
Out[ ]:
BersaljuBesok Awan3pm Kelembaban3pm
0 0.0 0.777778 0.161616
1 0.0 0.666667 0.474747
2 0.0 0.777778 0.515152
3 0.0 0.111111 0.303030
4 0.0 0.000000 0.171717
... ... ... ...
40416 0.0 0.444444 0.272727
40417 0.0 0.222222 0.585859
40418 0.0 0.222222 0.363636
40419 1.0 0.666667 0.696970
40420 0.0 0.111111 0.171717

40421 rows × 3 columns

In [ ]:
# Same three columns from the unscaled export, for the unscaled clustering run.
Nscalled=pd.read_csv("Nscaled_salju_train.csv",usecols=["Awan3pm","Kelembaban3pm","BersaljuBesok"])
Nscalled
Out[ ]:
BersaljuBesok Awan3pm Kelembaban3pm
0 0 7.0 17.0
1 0 6.0 48.0
2 0 7.0 52.0
3 0 1.0 31.0
4 0 0.0 18.0
... ... ... ...
40416 0 4.0 28.0
40417 0 2.0 59.0
40418 0 2.0 37.0
40419 1 6.0 70.0
40420 0 1.0 18.0

40421 rows × 3 columns

CLUSTERING¶

proses elbow

In [ ]:
epsilon = list(range(5)) # Initialisation of epsilon
for k in range(1,6):
    cluster = pd.read_csv("scaled_salju_train.csv", usecols=["Awan3pm", "Kelembaban3pm"], nrows=20000) # Read data file into 'cluster'
    rows = cluster.shape[0] #contains the total number of rows in cluster data 'rows'
    cols = cluster.shape[1] #contains the total number of columns in cluster data 'cols'
    centroids = cluster.loc[np.random.randint(1,rows+1,k)] # Randomly initialises 'k' no. of centroids
    centroids['new'] = list(range(1,k+1)) # New indices 1 to k are set for the dataframe 'centroids'
    centroids.set_index('new',inplace = True) 
    d = np.random.rand(rows) # Initialization of 'd' which would contain the centroid number closest to data point
    number_of_iterations = 15
    tmp_eps = list(range(number_of_iterations)) # 'tmp_eps' is the sum of squares of distances between points and centroid of a cluster for each iteration
    for i in range(0,number_of_iterations): # loop is for iterations
          for j in range(0,rows):
              d[j] = ((centroids - cluster.loc[j])**2).sum(axis = 1).idxmin()
          cluster['centroid number'] = d #new column 'centroid number' is added to dataframe 'cluster'
          MX = list(range(k)) # Initialisation of 'MX' which will store mean of 'x' values of each cluster
          MY = list(range(k)) # Initialisation of 'MY' which will store mean of 'y' values of each cluster
          for m in range(0,k):
              MX[m] = cluster[cluster['centroid number'] == (m+1)]['Awan3pm'].mean()
              MY[m] = cluster[cluster['centroid number'] == (m+1)]['Kelembaban3pm'].mean()
          centroids.replace(list(centroids['Awan3pm']),MX,inplace = True) # The 'centroids' are replaced with the new values
          centroids.replace(list(centroids['Kelembaban3pm']),MY,inplace = True) # The 'centroids' are replaced with the new values
          z = list(range(k)) # Initialisation of z and centroid of each cluster.
          for p in range(0,k): # loop calculates square of distances between data points and centroid of each cluster.
              z[p] = ((cluster[cluster['centroid number'] == p+1][['Awan3pm','Kelembaban3pm']] - centroids.iloc[p])**2).values.sum()
          tmp_eps[i] = sum(z) 
          epsilon[k-1] = tmp_eps[i] # The cost function after final iteration for each value of 'k' would be stored in epsilon.
    %reset_selective -f centroids # The dataframe 'centroids' is reset.
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-50-7cbf4fdf1520> in <module>()
     12     for i in range(0,number_of_iterations): # loop is for iterations
     13           for j in range(0,rows):
---> 14               d[j] = ((centroids - cluster.loc[j])**2).sum(axis = 1).idxmin()
     15           cluster['centroid number'] = d #new column 'centroid number' is added to dataframe 'cluster'
     16           MX = list(range(k)) # Initialisation of 'MX' which will store mean of 'x' values of each cluster

/usr/local/lib/python3.7/dist-packages/pandas/core/ops/__init__.py in f(self, other, axis, level, fill_value)
    653         if isinstance(other, ABCDataFrame):
    654             # Another DataFrame
--> 655             new_data = self._combine_frame(other, na_op, fill_value)
    656 
    657         elif isinstance(other, ABCSeries):

/usr/local/lib/python3.7/dist-packages/pandas/core/frame.py in _combine_frame(self, other, func, fill_value)
   5868                 return func(left, right)
   5869 
-> 5870         new_data = ops.dispatch_to_series(self, other, _arith_op)
   5871         return new_data
   5872 

/usr/local/lib/python3.7/dist-packages/pandas/core/ops/__init__.py in dispatch_to_series(left, right, func, axis)
    273         #  _frame_arith_method_with_reindex
    274 
--> 275         bm = left._mgr.operate_blockwise(right._mgr, array_op)
    276         return type(left)(bm)
    277 

/usr/local/lib/python3.7/dist-packages/pandas/core/internals/managers.py in operate_blockwise(self, other, array_op)
    365         Apply array_op blockwise with another (aligned) BlockManager.
    366         """
--> 367         return operate_blockwise(self, other, array_op)
    368 
    369     def apply(self: T, f, align_keys=None, **kwargs) -> T:

/usr/local/lib/python3.7/dist-packages/pandas/core/internals/ops.py in operate_blockwise(left, right, array_op)
     36             lvals, rvals = _get_same_shape_values(blk, rblk, left_ea, right_ea)
     37 
---> 38             res_values = array_op(lvals, rvals)
     39             if left_ea and not right_ea and hasattr(res_values, "reshape"):
     40                 res_values = res_values.reshape(1, -1)

/usr/local/lib/python3.7/dist-packages/pandas/core/ops/array_ops.py in arithmetic_op(left, right, op)
    188     else:
    189         with np.errstate(all="ignore"):
--> 190             res_values = na_arithmetic_op(lvalues, rvalues, op)
    191 
    192     return res_values

/usr/local/lib/python3.7/dist-packages/pandas/core/ops/array_ops.py in na_arithmetic_op(left, right, op, is_cmp)
    141 
    142     try:
--> 143         result = expressions.evaluate(op, left, right)
    144     except TypeError:
    145         if is_cmp:

/usr/local/lib/python3.7/dist-packages/pandas/core/computation/expressions.py in evaluate(op, a, b, use_numexpr)
    231         use_numexpr = use_numexpr and _bool_arith_check(op_str, a, b)
    232         if use_numexpr:
--> 233             return _evaluate(op, op_str, a, b)  # type: ignore
    234     return _evaluate_standard(op, op_str, a, b)
    235 

/usr/local/lib/python3.7/dist-packages/pandas/core/computation/expressions.py in _evaluate_numexpr(op, op_str, a, b)
     98     result = None
     99 
--> 100     if _can_use_numexpr(op, op_str, a, b, "evaluate"):
    101         is_reversed = op.__name__.strip("_").startswith("r")
    102         if is_reversed:

/usr/local/lib/python3.7/dist-packages/pandas/core/computation/expressions.py in _can_use_numexpr(op, op_str, a, b, dtype_check)
     74 
     75         # required min elements (otherwise we are adding overhead)
---> 76         if np.prod(a.shape) > _MIN_ELEMENTS:
     77             # check for dtype compatibility
     78             dtypes = set()

<__array_function__ internals> in prod(*args, **kwargs)

/usr/local/lib/python3.7/dist-packages/numpy/core/fromnumeric.py in prod(a, axis, dtype, out, keepdims, initial, where)
   2998     """
   2999     return _wrapreduction(a, np.multiply, 'prod', axis, dtype, out,
-> 3000                           keepdims=keepdims, initial=initial, where=where)
   3001 
   3002 

/usr/local/lib/python3.7/dist-packages/numpy/core/fromnumeric.py in _wrapreduction(obj, ufunc, method, axis, dtype, out, **kwargs)
     85                 return reduction(axis=axis, out=out, **passkwargs)
     86 
---> 87     return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
     88 
     89 

KeyboardInterrupt: 
In [ ]:
print(epsilon) #cek nilai epsilon
[2448.384170676461, 895.2633686650431, 734.3420954055964, 3, 4]
In [ ]:
# Elbow plot: cost (epsilon) against k — look for the bend.
k = list(range(1, 6))                               # candidate values of k
fig, ax = plt.subplots(figsize=(10, 8))             # figure size adjusted
ax.tick_params(axis='both', labelsize=20)           # enlarge both axes' tick labels
ax.plot(k, epsilon, 'go--', linewidth=1.5, markersize=4)
ax.set_xlabel('Nilai dari k', fontsize=20)          # x-axis label
ax.set_ylabel('Nilai dari epsilon', fontsize=20)    # y-axis label
Out[ ]:
Text(0, 0.5, 'Nilai dari epsilon')

Jadi nilai k berada di 2, karena proses terjadinya penekukan berada pada nilai 2

Mengkonversi dataframe ke dalam bentuk array agar lebih mudah diolah

In [ ]:
#untuk data yang scale
Awan3pmScaled=scalled.Awan3pm #Scale Awan3pmScaled
Kelembaban3pmScaled=scalled.Kelembaban3pm #Scale Kelembaban3pmScaled
SaljuBesokScale=scalled.BersaljuBesok #Scale SaljuBesokScale
ScaleNew=[]
i=0
while i < len(scalled):
      DT1 = [Awan3pmScaled[i], Kelembaban3pmScaled[i], SaljuBesokScale[i]]
      ScaleNew.append(DT1)
      i += 1
In [ ]:
#untuk data yang unscale
Awan3pmNScaled=Nscalled.Awan3pm #Unscale Awan3pmNScaled
Kelembaban3pmNScaled=Nscalled.Kelembaban3pm #Unscale Kelembaban3pmNScaled
SaljuBesokNScale=Nscalled.BersaljuBesok #Unscale SaljuBesokNScale
NScaleNew=[]
i=0
while i < len(Nscalled):
      DT2=[Awan3pmNScaled[i],Kelembaban3pmNScaled[i],SaljuBesokNScale[i]]
      NScaleNew.append(DT2)
      i += 1

untuk mencari nilai distance antara dua titik menggunakan fungsi manhattan distance

In [ ]:
def Mdist(centroid, data):
    """Manhattan (L1) distance between two 2-D points.

    Both arguments are indexable as [x, y]; any extra elements are ignored.
    """
    dx = float(centroid[0] - data[0])
    dy = float(centroid[1] - data[1])
    return abs(dx) + abs(dy)

menentukan centroid baru pada setiap iterasi

In [ ]:
def Cent(cluster):
    """Return the [mean-x, mean-y] centroid of a non-empty list of points.

    Each point is indexable as [x, y]; extra elements are ignored.
    Raises ZeroDivisionError on an empty cluster.
    """
    n = len(cluster)
    total_x = sum(point[0] for point in cluster)
    total_y = sum(point[1] for point in cluster)
    return [total_x / n, total_y / n]
In [ ]:
def kmeans(dataset, maxitr):
    """Hand-rolled k-means with k=2 using Manhattan distance.

    dataset : list of [x, y, ...] points (elements past [1] are carried
              along but ignored for clustering)
    maxitr  : maximum number of iterations
    Returns (centroids, cls1, cls2): the two final [x, y] centroids and the
    two clusters as lists of the original points.
    """
    # BUG FIX: the original drew initial centroids with rd.randint(0, 39744),
    # a hard-coded bound that silently excluded the tail of this dataset
    # (40421 rows) and would raise IndexError on smaller inputs. Use the
    # actual length. (Also fixed the 'cnet2' typo throughout.)
    last = len(dataset) - 1
    cent1 = dataset[rd.randint(0, last)]
    cent2 = dataset[rd.randint(0, last)]
    minus = 1  # signed total centroid movement; 0 means converged
    itr = 0
    while (minus != 0) and (itr < maxitr):
        cls1 = []
        cls2 = []
        cent1old = cent1
        cent2old = cent2
        # Assignment step: each point goes to its nearest centroid
        # (ties go to cluster 2, matching the original's `else` branch).
        for j in range(len(dataset)):
            dist1 = Mdist(cent1old, dataset[j])
            dist2 = Mdist(cent2old, dataset[j])
            if dist1 < dist2:
                cls1.append(dataset[j])
            else:
                cls2.append(dataset[j])
        # Update step. NOTE: Cent raises ZeroDivisionError if a cluster
        # empties; with this data and random restarts that has not occurred.
        cent1 = Cent(cls1)
        cent2 = Cent(cls2)
        minus = (cent1[0] - cent1old[0]) + (cent1[1] - cent1old[1]) + (cent2[0] - cent2old[0]) + (cent2[1] - cent2old[1])
        itr += 1
    centroids = [cent1, cent2]
    return centroids, cls1, cls2
In [ ]:
# Cluster both variants (up to 100 iterations each).
SCent, scaledCluster1, scaledCluster2 = kmeans(ScaleNew, 100)
NSCent, unscaledCluster1, unscaledCluster2 = kmeans(NScaleNew, 100)

scaled

In [ ]:
# Split the scaled clusters into per-axis coordinate lists for plotting.
c1AwanScale = [point[0] for point in scaledCluster1]       # cluster 1 x
c1KelembabanScale = [point[1] for point in scaledCluster1] # cluster 1 y
c2AwanScale = [point[0] for point in scaledCluster2]       # cluster 2 x
c2KelembabanScale = [point[1] for point in scaledCluster2] # cluster 2 y

unscaled

In [ ]:
# Split the unscaled clusters into per-axis coordinate lists for plotting.
c1AwanNScale = [point[0] for point in unscaledCluster1]       # cluster 1 x
c1KelembabanNScale = [point[1] for point in unscaledCluster1] # cluster 1 y
c2AwanNScale = [point[0] for point in unscaledCluster2]       # cluster 2 x
c2KelembabanNScale = [point[1] for point in unscaledCluster2] # cluster 2 y

cls1 dan cls2 ke dalam bentuk grafik

visualisasi clustering yang sudah discaling

In [ ]:
# Scatter plot of the two scaled clusters (yellow / red) with their
# centroids in green.
plt.scatter(c1AwanScale, c1KelembabanScale, c='y', edgecolors='black', linewidth=0.20)
plt.scatter(c2AwanScale, c2KelembabanScale, c='r', edgecolors='black', linewidth=0.20)
for cx, cy in SCent:  # the two centroids
    plt.scatter(cx, cy, c='g', edgecolors='magenta', linewidth=0.20)
plt.title('Scaled dataset')
plt.xlabel('Awan3pm')
plt.ylabel('Kelembaban3pm')
plt.show()

visualisasi clustering yang belum discaling

In [ ]:
# Scatter plot of the two unscaled clusters (yellow / red) with their
# centroids in green.
plt.scatter(c1AwanNScale, c1KelembabanNScale, c='y', edgecolors='black', linewidth=0.20)
plt.scatter(c2AwanNScale, c2KelembabanNScale, c='r', edgecolors='black', linewidth=0.20)
for cx, cy in NSCent:  # the two centroids
    plt.scatter(cx, cy, c='g', edgecolors='magenta', linewidth=0.20)
plt.title('Unscaled dataset')
plt.xlabel('Awan3pm')
plt.ylabel('Kelembaban3pm')
plt.show()

mengembalikan array dalam bentuk dataframe

In [ ]:
# Back to DataFrames: unzip each scaled cluster into coordinate tuples.
# (The trailing BersaljuBesok values are unpacked but not used further.)
c1AwanScale, c1KelembabanScale, scaledBersaljubesokc1 = zip(*scaledCluster1)
scaleCluster1 = pd.DataFrame(
    {'Awan3pm': c1AwanScale, 'Kelembaban3pm': c1KelembabanScale},
    columns=['Awan3pm', 'Kelembaban3pm'],
)
c2AwanScale, c2KelembabanScale, scaledBersaljubesokc2 = zip(*scaledCluster2)
scaleCluster2 = pd.DataFrame(
    {'Awan3pm': c2AwanScale, 'Kelembaban3pm': c2KelembabanScale},
    columns=['Awan3pm', 'Kelembaban3pm'],
)
In [ ]:
# Label each cluster: cluster 1 = no snow, cluster 2 = snow.
scaleCluster1['Cluster']='Tidak Bersalju' #cluster1scale
scaleCluster2['Cluster']='Bersalju' #cluster2scale

Dataset yang sudah discaling dan clustering

In [ ]:
# Combine the two labelled scaled clusters and show the size of each.
scaleCluster = pd.concat([scaleCluster1, scaleCluster2], axis=0)
# (The bare `scaleCluster` expression in the original was dead code: only a
# cell's last expression is displayed.)
scaleCluster.groupby("Cluster").size()
In [ ]:
# Back to DataFrames: unzip each unscaled cluster into coordinate tuples.
# (The trailing BersaljuBesok values are unpacked but not used further.)
c1AwanNScale, c1KelembabanNScale, unscaledBersaljubesokc1 = zip(*unscaledCluster1)
NscaleCluster1 = pd.DataFrame(
    {'Awan3pm': c1AwanNScale, 'Kelembaban3pm': c1KelembabanNScale},
    columns=['Awan3pm', 'Kelembaban3pm'],
)
c2AwanNScale, c2KelembabanNScale, unscaledBersaljubesokc2 = zip(*unscaledCluster2)
NscaleCluster2 = pd.DataFrame(
    {'Awan3pm': c2AwanNScale, 'Kelembaban3pm': c2KelembabanNScale},
    columns=['Awan3pm', 'Kelembaban3pm'],
)
In [ ]:
# Label each cluster: cluster 1 = no snow, cluster 2 = snow.
NscaleCluster1['Cluster'] = 'Tidak Bersalju' #cluster1Nscale
NscaleCluster2['Cluster'] = 'Bersalju' #cluster2Nscale

Dataset yang belum discaling tapi sudah diclustering

In [ ]:
# Combine the two labelled unscaled clusters and show the size of each.
NscaleCluster = pd.concat([NscaleCluster1, NscaleCluster2], axis=0)
# (The bare `NscaleCluster` expression in the original was dead code: only a
# cell's last expression is displayed.)
NscaleCluster.groupby("Cluster").size()

export data ke csv

In [ ]:
# Persist both clustering results as CSV.
scaleCluster.to_csv('ClusteringScalled.csv') #save to csv
NscaleCluster.to_csv('ClusteringUnscalled.csv') #save to csv
In [ ]:
files.download('ClusteringScalled.csv') #download csv
In [ ]:
files.download('ClusteringUnscalled.csv') #download csv