목차
data
1. 라이브러리
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
2. EDA
unique(), dropna(), value_counts(), isnull(), reset_index()
1) encoding: categorical to numeric
# Upper-case the `sex` labels so values that differ only in letter case fall into the same category.
penguins_subset['sex'] = penguins_subset['sex'].str.upper()
# One-hot encode `sex`; drop_first=True drops the first level, leaving k-1 indicator columns.
penguins_subset=pd.get_dummies(penguins_subset, drop_first=True, columns=['sex'])
2) drop
# Remove the `island` column from the working frame (axis=1 selects columns).
penguins_subset = penguins_subset.drop(['island'], axis=1)
3. Scale
1) variable X
# Feature matrix: every column except `species`, which is held out and later compared against the clusters.
X= penguins_subset.drop(['species'], axis=1)
2) StandardScaler()
# Standardize every feature column (zero mean, unit variance); k-means is distance-based, so features are put on a common scale.
x_scaled= StandardScaler().fit_transform(X)
4. 데이터 모델
k-means 모델로 학습시킨다.
# Fit k-means with 3 clusters on the scaled features; random_state=42 fixes the seed for reproducibility.
kmeans=KMeans(n_clusters=3, random_state=42)
kmeans.fit(x_scaled)
kmeans가 결정한 군집이다.
# Cluster index assigned to each row of x_scaled by the fitted model.
kmeans.labels_
5. 평가
1) Inertia
평가를 위해 inertia를 계산해봤다.
# Sum of squared distances from each sample to its closest cluster centre (lower means tighter clusters).
kmeans.inertia_
578.8284278107235
이번에는 군집 개수를 2부터 10까지 바꿔가며 inertia가 어떻게 바뀌는지 살펴보자.
# Candidate cluster counts to evaluate: 2 through 10 inclusive.
num_cluster = list(range(2, 11))
def kmeans_inertia(num_clusters, x_vals, random_state=42):
    """Fit one k-means model per cluster count and collect each model's inertia.

    Args:
        num_clusters: Iterable of cluster counts, each used as ``n_clusters``.
        x_vals: Feature matrix passed unchanged to ``KMeans.fit``.
        random_state: Seed forwarded to every ``KMeans`` instance. Defaults
            to 42, the value that was previously hard-coded.

    Returns:
        A list of ``inertia_`` values, one per entry of ``num_clusters`` and
        in the same order. An empty ``num_clusters`` yields an empty list.
    """
    # KMeans.fit returns the fitted estimator, so inertia_ can be read off directly.
    return [
        KMeans(n_clusters=num, random_state=random_state).fit(x_vals).inertia_
        for num in num_clusters
    ]
# Inertia for each cluster count in num_cluster, in the same order.
inertia = kmeans_inertia(num_cluster, x_scaled)
[885.6224143652249,
578.8284278107235,
386.14534424773285,
284.5464837898288,
217.92858573807678,
201.39287843423264,
186.82270634899209,
173.47283154242746,
164.55854201979943]
시각화하자.
# Line plot of inertia against the number of clusters, used to look for an "elbow" where the curve flattens.
plot=sns.lineplot(x=num_cluster, y=inertia)
plot.set_xlabel("number of cluster")
plot.set_ylabel("Inertia")
2) silhouette score
# Mean silhouette coefficient of the 3-cluster model; the score lies in [-1, 1] and higher means better-separated clusters.
kmeans_sil_score= silhouette_score(x_scaled, kmeans.labels_)
kmeans_sil_score
0.45101024097188364
군집 개수에 따라 실루엣 점수가 어떻게 달라지는지 확인하자.
# Candidate cluster counts to evaluate: 2 through 10 inclusive.
num_clusters = list(range(2, 11))
def kmeans_sil(num_clusters, x_vals, random_state=42):
    """Fit one k-means model per cluster count and collect its silhouette score.

    Args:
        num_clusters: Iterable of cluster counts, each used as ``n_clusters``.
        x_vals: Feature matrix used both to fit each model and to score it.
        random_state: Seed forwarded to every ``KMeans`` instance. Defaults
            to 42, the value that was previously hard-coded.

    Returns:
        A list of silhouette scores, one per entry of ``num_clusters`` and in
        the same order. An empty ``num_clusters`` yields an empty list.
    """
    sil_score = []
    for num in num_clusters:
        # KMeans.fit returns the fitted estimator, so labels_ can be read off directly.
        labels = KMeans(n_clusters=num, random_state=random_state).fit(x_vals).labels_
        sil_score.append(silhouette_score(x_vals, labels))
    return sil_score
# Silhouette score for each cluster count in num_clusters, in the same order.
sil_score= kmeans_sil(num_clusters, x_scaled)
sil_score
[0.44398088353055243,
0.45101024097188364,
0.5080140996630784,
0.519998574860868,
0.5263224884981607,
0.47774022332151733,
0.42680523270292947,
0.35977478703657334,
0.3589883410610364]
시각화하자.
# Line plot of silhouette score against the number of clusters; the peak marks the best-scoring cluster count.
plot=sns.lineplot(x=num_clusters, y=sil_score)
plot.set_xlabel("clusters")
plot.set_ylabel("silhouette score")
6. 점검
cluster의 개수가 6개일 때 실루엣 점수가 가장 높으므로(약 0.526), cluster 개수를 6으로 두고 kmeans 모델을 만든다.
# Refit k-means with 6 clusters on the same scaled features, using the same seed as the earlier models.
kmeans6=KMeans(n_clusters=6, random_state=42)
kmeans6.fit(x_scaled)
레이블 확인하기
# Distinct cluster labels produced by the 6-cluster model.
np.unique(kmeans6.labels_)
0부터 5까지 나온다.
cluster 열 따로 만들기
# Attach each row's cluster label as a new column; this relies on penguins_subset having the same rows, in the same order, as x_scaled.
penguins_subset['cluster']= kmeans6.labels_
원래 종(species)과 만들어진 cluster가 얼마나 다른지 알기 위해 그룹짓기
# Count the rows for every (cluster, species) pair.
penguins_subset.groupby(by=['cluster', 'species']).size()
막대 그래프 그리기
# Bar chart of the (cluster, species) row counts, one bar per pair.
# NOTE(review): the trailing semicolon presumably suppresses the notebook's echo of the returned Axes object.
penguins_subset.groupby(by=['cluster', 'species']).size().plot.bar(title='Clusters differentiated by species',
figsize=(6, 5),
ylabel='Size',
xlabel='(Cluster, Species)');