파이썬을 활용한 이커머스 데이터분석_강의를 듣고 따라했던 코딩과 요점을 정리하였다.
- 출처: fast campus
Chapter.07 고객 분류 (KMeans)
분석의 목적
Kmeans Clustering을 활용하여, 데이터 기반의 고객 Segment 분류
쇼핑몰 고객 데이터
본격적으로 배우기에 앞서, 간단한 예제를 다뤄본다. KMeans는 비지도 학습이므로 원래 y값(정답 레이블)이 없지만, 이 예제에서는 군집 결과를 정답과 비교해 보기 위해 y값이 함께 생성되는 데이터를 사용한다. (y값은 학습에는 사용하지 않는다.)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_blobs
make_blobs(n_samples = 200, n_features = 200, centers = 4, random_state = 100)
(array([[ 1.20060988, -4.11602225, -1.73066167, ..., 7.17588741, -6.60857575, -1.91594874], [ 1.77915078, -4.24786567, -2.30132999, ..., 6.02667345, -5.45367431, -3.46164153], [ 0.70237751, -4.95952703, -0.70912414, ..., 8.19837411, -6.84799295, -1.53790267], ..., [ -2.11081239, 4.13898664, -1.84492092, ..., -11.10319888, 1.13777799, -0.18317904], [ 0.72859818, -4.66721381, -2.93532096, ..., 8.8968047 , -6.49876746, -2.37012576], [ 0.75919257, -3.29741063, -0.49985464, ..., 7.87040714, -4.46642633, -4.41132106]]), array([0, 0, 0, 3, 2, 2, 3, 3, 2, 0, 3, 1, 3, 3, 2, 3, 3, 3, 1, 0, 3, 1, 3, 2, 0, 3, 1, 1, 0, 2, 0, 3, 1, 0, 0, 0, 0, 0, 3, 2, 2, 1, 1, 2, 3, 2, 3, 3, 2, 2, 2, 1, 1, 0, 2, 3, 3, 0, 0, 3, 2, 2, 1, 1, 1, 1, 3, 3, 0, 1, 3, 1, 3, 1, 0, 1, 2, 1, 2, 2, 2, 2, 1, 3, 3, 2, 2, 3, 0, 3, 2, 2, 1, 3, 0, 2, 1, 2, 1, 0, 3, 1, 2, 0, 0, 1, 1, 3, 2, 1, 1, 1, 0, 3, 1, 0, 2, 3, 1, 3, 3, 1, 3, 2, 0, 0, 3, 0, 1, 2, 3, 1, 3, 2, 2, 0, 1, 0, 2, 3, 1, 3, 3, 1, 2, 0, 2, 2, 2, 2, 1, 3, 0, 1, 0, 2, 2, 0, 0, 2, 2, 1, 0, 1, 0, 1, 1, 1, 2, 3, 0, 3, 0, 0, 3, 0, 1, 2, 0, 0, 2, 1, 3, 0, 0, 3, 3, 2, 3, 0, 1, 1, 2, 2, 0, 0, 1, 1, 0, 0]))
data = make_blobs(n_samples = 200, n_features = 2, centers = 4, random_state = 100)
data
(array([[-1.04541152e+01, -7.62125255e+00], [-1.00696483e+01, -6.35203699e+00], [-9.72925240e+00, -8.44434843e+00], [-9.53601173e+00, -8.04924975e+00], [-3.00736857e+00, 5.70163667e+00], [-1.00825197e+01, -7.06157008e+00], [ 1.48356884e+00, 6.54127044e+00], [ 1.36630977e+00, 7.30084118e+00], [-6.15050479e-01, 7.65521577e+00], [-2.13385599e-01, 7.84779827e+00], [ 3.36818459e-01, -3.40287961e+00], [-1.37722038e+00, 6.91773657e+00], [-1.08726796e+00, 5.80147972e+00], [-1.63584937e+00, 8.88579630e+00], [-7.62592530e-01, 7.32519908e+00], [-1.93336328e+00, 5.70953908e+00], [-9.51418426e+00, -7.24137223e+00], [-9.86650380e-01, 6.87917724e+00], [-9.37825374e+00, -7.94292047e+00], [ 2.66345400e-01, 7.40859703e+00], [-1.20385333e+01, -6.80941325e+00], [-9.20343837e+00, -8.85252239e+00], [ 4.10071850e-01, -3.99744881e+00], [ 2.25174587e+00, 6.49587933e+00], [ 7.92075370e-01, -4.42865470e+00], [ 5.41860777e-01, -4.37693628e+00], [-7.56585997e-01, 5.28608375e+00], [-8.95266345e+00, -8.55198023e+00], [-1.68442974e+00, 7.91278699e+00], [ 1.48113772e+00, -3.69640708e+00], [ 8.75413399e-01, -5.04555103e+00], [ 3.00989843e+00, 7.05349904e+00], [ 9.76962304e-01, -3.92480271e+00], [-9.94328955e+00, -6.73708455e+00], [-9.79380030e+00, -5.02918555e+00], [-1.37939973e+00, 7.84488351e+00], [-2.10521664e+00, -4.39929502e+00], [ 4.29963213e-01, -5.55093054e+00], [-1.01724010e+01, -8.42467961e+00], [-1.40787241e+00, 8.20386947e+00], [-8.86329013e+00, -6.46974160e+00], [-1.17867375e+01, -7.38627486e+00], [ 2.70003492e+00, -4.42959486e+00], [ 3.74018799e+00, 5.86277425e+00], [ 9.76970826e-01, -4.40432866e+00], [ 3.53853432e+00, 8.06478420e+00], [-1.36995537e-02, -4.41397335e+00], [-2.21406638e+00, 6.30414753e+00], [ 1.54081964e+00, -4.53702344e+00], [-9.78801442e+00, -8.52678592e+00], [-9.88043097e+00, -6.05108146e+00], [ 1.44867217e+00, -5.53713539e+00], [-1.04422633e+01, -6.74549318e+00], [ 6.83084725e-01, -6.91976383e+00], [-8.45020142e+00, -7.32711070e+00], [-9.73089464e-01, 
-4.06651907e+00], [ 2.84503785e-01, -3.61576523e+00], [-1.31673099e+00, 6.54706372e+00], [ 2.05907384e+00, 4.80547205e+00], [ 3.18199713e+00, 5.33264367e+00], [ 2.48573450e+00, 6.91547152e+00], [-6.73223235e+00, -4.23441407e+00], [ 2.16784691e+00, -6.16570792e+00], [ 2.34531543e+00, 6.74289385e+00], [-1.02040585e+01, -7.97090751e+00], [ 3.19485113e+00, 6.26478506e+00], [-1.19228021e+00, 6.14310847e+00], [-9.14758038e+00, -8.76805755e+00], [ 3.53993567e+00, 5.45180895e+00], [-9.61724546e+00, -7.90770302e+00], [ 3.42501883e+00, 6.99734348e+00], [-2.72690232e+00, 6.73825748e+00], [ 3.76567451e+00, 6.38497839e+00], [-1.05675606e+01, -4.10339044e+00], [ 2.20129016e+00, 6.70519206e+00], [-7.88653676e+00, -6.82121417e+00], [ 5.36321701e-01, -5.12183028e+00], [ 3.57358029e+00, 8.64954183e+00], [ 1.28366703e+00, 6.78013179e+00], [-8.38331511e+00, -7.74858090e+00], [ 3.74605338e+00, 6.15055885e+00], [ 1.64692124e+00, -4.00437943e+00], [-1.05324768e+01, -7.84355739e+00], [-9.49791635e+00, -8.39689798e+00], [ 2.89273011e-01, -5.63206350e+00], [ 4.30265348e+00, 6.60570489e+00], [-2.12627756e+00, 7.65870629e+00], [ 1.61855217e+00, -5.73960464e+00], [-3.05026421e+00, 8.94223661e+00], [ 1.55822031e+00, -3.74572223e+00], [ 3.26013324e+00, 6.43099946e+00], [-8.62817084e+00, -8.01420585e+00], [-1.74836105e+00, 5.46645575e+00], [-4.88300213e-01, -5.66504681e+00], [-1.21747799e+01, -7.91678822e+00], [-8.34630802e+00, -8.66130645e+00], [-8.36552370e-01, -5.56887330e+00], [ 3.07922486e+00, 6.38042572e+00], [-4.15961535e-01, 6.66600489e+00], [-9.81739294e+00, -8.17137695e+00], [ 2.88933650e+00, 5.93216577e+00], [-1.87511018e+00, 5.62449961e+00], [ 6.19210169e-01, -4.88278873e+00], [ 5.08677288e+00, 6.20404737e+00], [ 5.65144983e-01, 6.55222496e+00], [-1.28725738e+00, 6.21060091e+00], [-7.45479667e-01, -2.96189843e+00], [ 2.90270640e+00, -4.98332671e+00], [ 4.01803539e+00, 5.45558747e+00], [-3.39121202e-01, 8.97323488e+00], [-3.52483690e+00, 6.81598206e+00], [ 1.89502028e+00, 
-5.86480291e+00], [ 2.09040360e+00, 6.75543975e+00], [-2.80603999e+00, 6.99066209e+00], [ 1.45801414e+00, 5.95257044e+00], [ 2.74467226e+00, -4.80951565e+00], [-2.72740300e-01, 6.66523797e+00], [ 3.91373218e+00, 5.43922903e+00], [-1.05007367e+01, -8.55142948e+00], [-8.75816544e+00, -6.40864861e+00], [-9.48216559e+00, -7.72813495e+00], [ 4.90551479e+00, 6.11766373e+00], [-1.15211263e-01, -4.07510454e+00], [-8.30225088e+00, -6.55572135e+00], [-8.66454079e+00, -5.90508127e+00], [-3.19918761e-01, -4.98235849e+00], [-2.10963123e+00, 8.47168937e+00], [-7.19473257e-02, -5.26054466e+00], [-1.92546452e+00, 5.53701971e+00], [-2.92469111e+00, 6.25476272e+00], [-2.90664753e+00, 5.79835066e+00], [ 1.61854360e+00, -4.88855923e+00], [ 1.05261753e+00, -3.49553010e+00], [ 3.39585894e+00, 7.50999675e+00], [ 1.08927851e+00, -5.50265563e+00], [-7.60625341e+00, -8.50822003e+00], [-7.72653016e-01, 7.33138990e+00], [ 7.88487590e-01, -5.32234378e+00], [-8.37853170e-01, -4.06344834e+00], [-7.30021881e-01, 6.45740173e+00], [ 1.11746530e-01, -3.61615829e+00], [ 7.65227534e-02, 7.58891330e+00], [ 4.20540979e+00, 7.60887101e+00], [-9.50202446e+00, -9.45345197e+00], [-8.78006742e+00, -6.64156429e+00], [ 3.50775812e+00, 6.70677389e+00], [ 3.95479461e+00, 7.06022828e+00], [-9.05224646e-01, 5.98849223e+00], [ 2.27830898e+00, 6.76554973e+00], [ 7.89005756e-01, 6.73031309e+00], [ 3.23608770e+00, 7.69529426e+00], [-1.11759409e+01, -6.93311642e+00], [-6.98588694e-01, -3.52763818e+00], [-1.04953315e+01, -8.38204241e+00], [-1.02817702e+01, -7.10826837e+00], [ 6.78603005e-01, -4.17761085e+00], [-2.02407808e+00, 6.67946253e+00], [-9.17624916e-01, 6.45845823e+00], [ 3.99191575e+00, 6.60357995e+00], [-1.04334882e+00, 7.16550989e+00], [ 3.30709252e+00, 7.52045266e+00], [ 3.28538514e+00, 6.34622469e+00], [ 4.37872623e+00, 7.77078124e+00], [-1.82947923e+00, 5.74778105e+00], [-1.01908037e+01, -7.08820617e+00], [ 3.43857919e+00, 6.34659863e+00], [-1.05304951e+01, -7.67861298e+00], [ 1.10594346e+00, 
-4.41906375e+00], [ 6.16219697e-01, -5.27504804e+00], [-1.20859872e+00, 5.21062268e+00], [ 4.61164315e+00, 6.05221253e+00], [-1.96557020e+00, 7.54469557e+00], [ 3.21718440e+00, 5.66238231e+00], [ 1.93975803e-01, 6.17337188e+00], [ 5.87148931e-03, -3.18314256e+00], [ 4.33614078e-01, 5.44808653e+00], [-3.01723335e+00, 7.00340678e+00], [ 2.84271780e+00, 9.26217307e+00], [-9.42853058e+00, -8.99838025e+00], [-8.51802106e+00, -9.30400873e+00], [ 6.13631530e+00, 4.54419105e+00], [ 1.43136075e+00, 6.68419189e+00], [-1.51854685e+00, 6.35232464e+00], [ 1.59909918e+00, -3.07105617e+00], [-9.43762219e+00, -4.85961564e+00], [-7.67430564e-01, -5.47682218e+00], [ 2.14392371e+00, 7.44615522e+00], [ 5.25950490e+00, 6.18091147e+00], [ 4.27545710e+00, 6.27045780e+00], [ 2.48708050e+00, -2.89100712e+00], [ 2.97157371e+00, 7.25600900e+00], [ 3.79877054e+00, 7.85178586e+00], [ 2.05772110e+00, -6.12322912e+00], [-1.02169699e+01, -8.99327317e+00], [ 1.09049844e+00, -5.87582929e+00], [-5.60643409e-01, 6.87612506e+00], [-3.46772942e+00, 6.76072133e+00], [-1.10001801e+01, -7.80996633e+00], [ 1.89855120e+00, 6.84633761e+00], [ 3.23659674e-01, -5.10078403e+00]]), array([2, 2, 2, 2, 1, 2, 3, 3, 1, 1, 0, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 2, 0, 3, 0, 0, 1, 2, 1, 0, 0, 3, 0, 2, 2, 1, 0, 0, 2, 1, 2, 2, 0, 3, 0, 3, 0, 1, 0, 2, 2, 0, 2, 0, 2, 0, 0, 1, 3, 3, 3, 2, 0, 3, 2, 3, 1, 2, 3, 2, 3, 1, 3, 2, 3, 2, 0, 3, 3, 2, 3, 0, 2, 2, 0, 3, 1, 0, 1, 0, 3, 2, 1, 0, 2, 2, 0, 3, 1, 2, 3, 1, 0, 3, 1, 1, 0, 0, 3, 1, 1, 0, 3, 1, 3, 0, 1, 3, 2, 2, 2, 3, 0, 2, 2, 0, 1, 0, 1, 1, 1, 0, 0, 3, 0, 2, 1, 0, 0, 1, 0, 1, 3, 2, 2, 3, 3, 1, 3, 1, 3, 2, 0, 2, 2, 0, 1, 1, 3, 1, 3, 3, 3, 1, 2, 3, 2, 0, 0, 1, 3, 1, 3, 1, 0, 1, 1, 3, 2, 2, 3, 3, 1, 0, 2, 0, 3, 3, 3, 0, 3, 3, 0, 2, 0, 1, 1, 2, 3, 0]))
인덱싱 하기
data[0]
array([[-1.04541152e+01, -7.62125255e+00], [-1.00696483e+01, -6.35203699e+00], [-9.72925240e+00, -8.44434843e+00], [-9.53601173e+00, -8.04924975e+00], [-3.00736857e+00, 5.70163667e+00], [-1.00825197e+01, -7.06157008e+00], [ 1.48356884e+00, 6.54127044e+00], [ 1.36630977e+00, 7.30084118e+00], [-6.15050479e-01, 7.65521577e+00], [-2.13385599e-01, 7.84779827e+00], [ 3.36818459e-01, -3.40287961e+00], [-1.37722038e+00, 6.91773657e+00], [-1.08726796e+00, 5.80147972e+00], [-1.63584937e+00, 8.88579630e+00], [-7.62592530e-01, 7.32519908e+00], [-1.93336328e+00, 5.70953908e+00], [-9.51418426e+00, -7.24137223e+00], [-9.86650380e-01, 6.87917724e+00], [-9.37825374e+00, -7.94292047e+00], [ 2.66345400e-01, 7.40859703e+00], [-1.20385333e+01, -6.80941325e+00], [-9.20343837e+00, -8.85252239e+00], [ 4.10071850e-01, -3.99744881e+00], [ 2.25174587e+00, 6.49587933e+00], [ 7.92075370e-01, -4.42865470e+00], [ 5.41860777e-01, -4.37693628e+00], [-7.56585997e-01, 5.28608375e+00], [-8.95266345e+00, -8.55198023e+00], [-1.68442974e+00, 7.91278699e+00], [ 1.48113772e+00, -3.69640708e+00], [ 8.75413399e-01, -5.04555103e+00], [ 3.00989843e+00, 7.05349904e+00], [ 9.76962304e-01, -3.92480271e+00], [-9.94328955e+00, -6.73708455e+00], [-9.79380030e+00, -5.02918555e+00], [-1.37939973e+00, 7.84488351e+00], [-2.10521664e+00, -4.39929502e+00], [ 4.29963213e-01, -5.55093054e+00], [-1.01724010e+01, -8.42467961e+00], [-1.40787241e+00, 8.20386947e+00], [-8.86329013e+00, -6.46974160e+00], [-1.17867375e+01, -7.38627486e+00], [ 2.70003492e+00, -4.42959486e+00], [ 3.74018799e+00, 5.86277425e+00], [ 9.76970826e-01, -4.40432866e+00], [ 3.53853432e+00, 8.06478420e+00], [-1.36995537e-02, -4.41397335e+00], [-2.21406638e+00, 6.30414753e+00], [ 1.54081964e+00, -4.53702344e+00], [-9.78801442e+00, -8.52678592e+00], [-9.88043097e+00, -6.05108146e+00], [ 1.44867217e+00, -5.53713539e+00], [-1.04422633e+01, -6.74549318e+00], [ 6.83084725e-01, -6.91976383e+00], [-8.45020142e+00, -7.32711070e+00], [-9.73089464e-01, 
-4.06651907e+00], [ 2.84503785e-01, -3.61576523e+00], [-1.31673099e+00, 6.54706372e+00], [ 2.05907384e+00, 4.80547205e+00], [ 3.18199713e+00, 5.33264367e+00], [ 2.48573450e+00, 6.91547152e+00], [-6.73223235e+00, -4.23441407e+00], [ 2.16784691e+00, -6.16570792e+00], [ 2.34531543e+00, 6.74289385e+00], [-1.02040585e+01, -7.97090751e+00], [ 3.19485113e+00, 6.26478506e+00], [-1.19228021e+00, 6.14310847e+00], [-9.14758038e+00, -8.76805755e+00], [ 3.53993567e+00, 5.45180895e+00], [-9.61724546e+00, -7.90770302e+00], [ 3.42501883e+00, 6.99734348e+00], [-2.72690232e+00, 6.73825748e+00], [ 3.76567451e+00, 6.38497839e+00], [-1.05675606e+01, -4.10339044e+00], [ 2.20129016e+00, 6.70519206e+00], [-7.88653676e+00, -6.82121417e+00], [ 5.36321701e-01, -5.12183028e+00], [ 3.57358029e+00, 8.64954183e+00], [ 1.28366703e+00, 6.78013179e+00], [-8.38331511e+00, -7.74858090e+00], [ 3.74605338e+00, 6.15055885e+00], [ 1.64692124e+00, -4.00437943e+00], [-1.05324768e+01, -7.84355739e+00], [-9.49791635e+00, -8.39689798e+00], [ 2.89273011e-01, -5.63206350e+00], [ 4.30265348e+00, 6.60570489e+00], [-2.12627756e+00, 7.65870629e+00], [ 1.61855217e+00, -5.73960464e+00], [-3.05026421e+00, 8.94223661e+00], [ 1.55822031e+00, -3.74572223e+00], [ 3.26013324e+00, 6.43099946e+00], [-8.62817084e+00, -8.01420585e+00], [-1.74836105e+00, 5.46645575e+00], [-4.88300213e-01, -5.66504681e+00], [-1.21747799e+01, -7.91678822e+00], [-8.34630802e+00, -8.66130645e+00], [-8.36552370e-01, -5.56887330e+00], [ 3.07922486e+00, 6.38042572e+00], [-4.15961535e-01, 6.66600489e+00], [-9.81739294e+00, -8.17137695e+00], [ 2.88933650e+00, 5.93216577e+00], [-1.87511018e+00, 5.62449961e+00], [ 6.19210169e-01, -4.88278873e+00], [ 5.08677288e+00, 6.20404737e+00], [ 5.65144983e-01, 6.55222496e+00], [-1.28725738e+00, 6.21060091e+00], [-7.45479667e-01, -2.96189843e+00], [ 2.90270640e+00, -4.98332671e+00], [ 4.01803539e+00, 5.45558747e+00], [-3.39121202e-01, 8.97323488e+00], [-3.52483690e+00, 6.81598206e+00], [ 1.89502028e+00, 
-5.86480291e+00], [ 2.09040360e+00, 6.75543975e+00], [-2.80603999e+00, 6.99066209e+00], [ 1.45801414e+00, 5.95257044e+00], [ 2.74467226e+00, -4.80951565e+00], [-2.72740300e-01, 6.66523797e+00], [ 3.91373218e+00, 5.43922903e+00], [-1.05007367e+01, -8.55142948e+00], [-8.75816544e+00, -6.40864861e+00], [-9.48216559e+00, -7.72813495e+00], [ 4.90551479e+00, 6.11766373e+00], [-1.15211263e-01, -4.07510454e+00], [-8.30225088e+00, -6.55572135e+00], [-8.66454079e+00, -5.90508127e+00], [-3.19918761e-01, -4.98235849e+00], [-2.10963123e+00, 8.47168937e+00], [-7.19473257e-02, -5.26054466e+00], [-1.92546452e+00, 5.53701971e+00], [-2.92469111e+00, 6.25476272e+00], [-2.90664753e+00, 5.79835066e+00], [ 1.61854360e+00, -4.88855923e+00], [ 1.05261753e+00, -3.49553010e+00], [ 3.39585894e+00, 7.50999675e+00], [ 1.08927851e+00, -5.50265563e+00], [-7.60625341e+00, -8.50822003e+00], [-7.72653016e-01, 7.33138990e+00], [ 7.88487590e-01, -5.32234378e+00], [-8.37853170e-01, -4.06344834e+00], [-7.30021881e-01, 6.45740173e+00], [ 1.11746530e-01, -3.61615829e+00], [ 7.65227534e-02, 7.58891330e+00], [ 4.20540979e+00, 7.60887101e+00], [-9.50202446e+00, -9.45345197e+00], [-8.78006742e+00, -6.64156429e+00], [ 3.50775812e+00, 6.70677389e+00], [ 3.95479461e+00, 7.06022828e+00], [-9.05224646e-01, 5.98849223e+00], [ 2.27830898e+00, 6.76554973e+00], [ 7.89005756e-01, 6.73031309e+00], [ 3.23608770e+00, 7.69529426e+00], [-1.11759409e+01, -6.93311642e+00], [-6.98588694e-01, -3.52763818e+00], [-1.04953315e+01, -8.38204241e+00], [-1.02817702e+01, -7.10826837e+00], [ 6.78603005e-01, -4.17761085e+00], [-2.02407808e+00, 6.67946253e+00], [-9.17624916e-01, 6.45845823e+00], [ 3.99191575e+00, 6.60357995e+00], [-1.04334882e+00, 7.16550989e+00], [ 3.30709252e+00, 7.52045266e+00], [ 3.28538514e+00, 6.34622469e+00], [ 4.37872623e+00, 7.77078124e+00], [-1.82947923e+00, 5.74778105e+00], [-1.01908037e+01, -7.08820617e+00], [ 3.43857919e+00, 6.34659863e+00], [-1.05304951e+01, -7.67861298e+00], [ 1.10594346e+00, 
-4.41906375e+00], [ 6.16219697e-01, -5.27504804e+00], [-1.20859872e+00, 5.21062268e+00], [ 4.61164315e+00, 6.05221253e+00], [-1.96557020e+00, 7.54469557e+00], [ 3.21718440e+00, 5.66238231e+00], [ 1.93975803e-01, 6.17337188e+00], [ 5.87148931e-03, -3.18314256e+00], [ 4.33614078e-01, 5.44808653e+00], [-3.01723335e+00, 7.00340678e+00], [ 2.84271780e+00, 9.26217307e+00], [-9.42853058e+00, -8.99838025e+00], [-8.51802106e+00, -9.30400873e+00], [ 6.13631530e+00, 4.54419105e+00], [ 1.43136075e+00, 6.68419189e+00], [-1.51854685e+00, 6.35232464e+00], [ 1.59909918e+00, -3.07105617e+00], [-9.43762219e+00, -4.85961564e+00], [-7.67430564e-01, -5.47682218e+00], [ 2.14392371e+00, 7.44615522e+00], [ 5.25950490e+00, 6.18091147e+00], [ 4.27545710e+00, 6.27045780e+00], [ 2.48708050e+00, -2.89100712e+00], [ 2.97157371e+00, 7.25600900e+00], [ 3.79877054e+00, 7.85178586e+00], [ 2.05772110e+00, -6.12322912e+00], [-1.02169699e+01, -8.99327317e+00], [ 1.09049844e+00, -5.87582929e+00], [-5.60643409e-01, 6.87612506e+00], [-3.46772942e+00, 6.76072133e+00], [-1.10001801e+01, -7.80996633e+00], [ 1.89855120e+00, 6.84633761e+00], [ 3.23659674e-01, -5.10078403e+00]])
data[1]
array([2, 2, 2, 2, 1, 2, 3, 3, 1, 1, 0, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 2, 0, 3, 0, 0, 1, 2, 1, 0, 0, 3, 0, 2, 2, 1, 0, 0, 2, 1, 2, 2, 0, 3, 0, 3, 0, 1, 0, 2, 2, 0, 2, 0, 2, 0, 0, 1, 3, 3, 3, 2, 0, 3, 2, 3, 1, 2, 3, 2, 3, 1, 3, 2, 3, 2, 0, 3, 3, 2, 3, 0, 2, 2, 0, 3, 1, 0, 1, 0, 3, 2, 1, 0, 2, 2, 0, 3, 1, 2, 3, 1, 0, 3, 1, 1, 0, 0, 3, 1, 1, 0, 3, 1, 3, 0, 1, 3, 2, 2, 2, 3, 0, 2, 2, 0, 1, 0, 1, 1, 1, 0, 0, 3, 0, 2, 1, 0, 0, 1, 0, 1, 3, 2, 2, 3, 3, 1, 3, 1, 3, 2, 0, 2, 2, 0, 1, 1, 3, 1, 3, 3, 3, 1, 2, 3, 2, 0, 0, 1, 3, 1, 3, 1, 0, 1, 1, 3, 2, 2, 3, 3, 1, 0, 2, 0, 3, 3, 3, 0, 3, 3, 0, 2, 0, 1, 1, 2, 3, 0])
data[0][:,0]
array([-1.04541152e+01, -1.00696483e+01, -9.72925240e+00, -9.53601173e+00, -3.00736857e+00, -1.00825197e+01, 1.48356884e+00, 1.36630977e+00, -6.15050479e-01, -2.13385599e-01, 3.36818459e-01, -1.37722038e+00, -1.08726796e+00, -1.63584937e+00, -7.62592530e-01, -1.93336328e+00, -9.51418426e+00, -9.86650380e-01, -9.37825374e+00, 2.66345400e-01, -1.20385333e+01, -9.20343837e+00, 4.10071850e-01, 2.25174587e+00, 7.92075370e-01, 5.41860777e-01, -7.56585997e-01, -8.95266345e+00, -1.68442974e+00, 1.48113772e+00, 8.75413399e-01, 3.00989843e+00, 9.76962304e-01, -9.94328955e+00, -9.79380030e+00, -1.37939973e+00, -2.10521664e+00, 4.29963213e-01, -1.01724010e+01, -1.40787241e+00, -8.86329013e+00, -1.17867375e+01, 2.70003492e+00, 3.74018799e+00, 9.76970826e-01, 3.53853432e+00, -1.36995537e-02, -2.21406638e+00, 1.54081964e+00, -9.78801442e+00, -9.88043097e+00, 1.44867217e+00, -1.04422633e+01, 6.83084725e-01, -8.45020142e+00, -9.73089464e-01, 2.84503785e-01, -1.31673099e+00, 2.05907384e+00, 3.18199713e+00, 2.48573450e+00, -6.73223235e+00, 2.16784691e+00, 2.34531543e+00, -1.02040585e+01, 3.19485113e+00, -1.19228021e+00, -9.14758038e+00, 3.53993567e+00, -9.61724546e+00, 3.42501883e+00, -2.72690232e+00, 3.76567451e+00, -1.05675606e+01, 2.20129016e+00, -7.88653676e+00, 5.36321701e-01, 3.57358029e+00, 1.28366703e+00, -8.38331511e+00, 3.74605338e+00, 1.64692124e+00, -1.05324768e+01, -9.49791635e+00, 2.89273011e-01, 4.30265348e+00, -2.12627756e+00, 1.61855217e+00, -3.05026421e+00, 1.55822031e+00, 3.26013324e+00, -8.62817084e+00, -1.74836105e+00, -4.88300213e-01, -1.21747799e+01, -8.34630802e+00, -8.36552370e-01, 3.07922486e+00, -4.15961535e-01, -9.81739294e+00, 2.88933650e+00, -1.87511018e+00, 6.19210169e-01, 5.08677288e+00, 5.65144983e-01, -1.28725738e+00, -7.45479667e-01, 2.90270640e+00, 4.01803539e+00, -3.39121202e-01, -3.52483690e+00, 1.89502028e+00, 2.09040360e+00, -2.80603999e+00, 1.45801414e+00, 2.74467226e+00, -2.72740300e-01, 3.91373218e+00, -1.05007367e+01, -8.75816544e+00, 
-9.48216559e+00, 4.90551479e+00, -1.15211263e-01, -8.30225088e+00, -8.66454079e+00, -3.19918761e-01, -2.10963123e+00, -7.19473257e-02, -1.92546452e+00, -2.92469111e+00, -2.90664753e+00, 1.61854360e+00, 1.05261753e+00, 3.39585894e+00, 1.08927851e+00, -7.60625341e+00, -7.72653016e-01, 7.88487590e-01, -8.37853170e-01, -7.30021881e-01, 1.11746530e-01, 7.65227534e-02, 4.20540979e+00, -9.50202446e+00, -8.78006742e+00, 3.50775812e+00, 3.95479461e+00, -9.05224646e-01, 2.27830898e+00, 7.89005756e-01, 3.23608770e+00, -1.11759409e+01, -6.98588694e-01, -1.04953315e+01, -1.02817702e+01, 6.78603005e-01, -2.02407808e+00, -9.17624916e-01, 3.99191575e+00, -1.04334882e+00, 3.30709252e+00, 3.28538514e+00, 4.37872623e+00, -1.82947923e+00, -1.01908037e+01, 3.43857919e+00, -1.05304951e+01, 1.10594346e+00, 6.16219697e-01, -1.20859872e+00, 4.61164315e+00, -1.96557020e+00, 3.21718440e+00, 1.93975803e-01, 5.87148931e-03, 4.33614078e-01, -3.01723335e+00, 2.84271780e+00, -9.42853058e+00, -8.51802106e+00, 6.13631530e+00, 1.43136075e+00, -1.51854685e+00, 1.59909918e+00, -9.43762219e+00, -7.67430564e-01, 2.14392371e+00, 5.25950490e+00, 4.27545710e+00, 2.48708050e+00, 2.97157371e+00, 3.79877054e+00, 2.05772110e+00, -1.02169699e+01, 1.09049844e+00, -5.60643409e-01, -3.46772942e+00, -1.10001801e+01, 1.89855120e+00, 3.23659674e-01])
data[0][0]
array([-10.45411516, -7.62125255])
컬럼단위로 인덱싱 하기
data[0][:, 0]
array([-1.04541152e+01, -1.00696483e+01, -9.72925240e+00, -9.53601173e+00, -3.00736857e+00, -1.00825197e+01, 1.48356884e+00, 1.36630977e+00, -6.15050479e-01, -2.13385599e-01, 3.36818459e-01, -1.37722038e+00, -1.08726796e+00, -1.63584937e+00, -7.62592530e-01, -1.93336328e+00, -9.51418426e+00, -9.86650380e-01, -9.37825374e+00, 2.66345400e-01, -1.20385333e+01, -9.20343837e+00, 4.10071850e-01, 2.25174587e+00, 7.92075370e-01, 5.41860777e-01, -7.56585997e-01, -8.95266345e+00, -1.68442974e+00, 1.48113772e+00, 8.75413399e-01, 3.00989843e+00, 9.76962304e-01, -9.94328955e+00, -9.79380030e+00, -1.37939973e+00, -2.10521664e+00, 4.29963213e-01, -1.01724010e+01, -1.40787241e+00, -8.86329013e+00, -1.17867375e+01, 2.70003492e+00, 3.74018799e+00, 9.76970826e-01, 3.53853432e+00, -1.36995537e-02, -2.21406638e+00, 1.54081964e+00, -9.78801442e+00, -9.88043097e+00, 1.44867217e+00, -1.04422633e+01, 6.83084725e-01, -8.45020142e+00, -9.73089464e-01, 2.84503785e-01, -1.31673099e+00, 2.05907384e+00, 3.18199713e+00, 2.48573450e+00, -6.73223235e+00, 2.16784691e+00, 2.34531543e+00, -1.02040585e+01, 3.19485113e+00, -1.19228021e+00, -9.14758038e+00, 3.53993567e+00, -9.61724546e+00, 3.42501883e+00, -2.72690232e+00, 3.76567451e+00, -1.05675606e+01, 2.20129016e+00, -7.88653676e+00, 5.36321701e-01, 3.57358029e+00, 1.28366703e+00, -8.38331511e+00, 3.74605338e+00, 1.64692124e+00, -1.05324768e+01, -9.49791635e+00, 2.89273011e-01, 4.30265348e+00, -2.12627756e+00, 1.61855217e+00, -3.05026421e+00, 1.55822031e+00, 3.26013324e+00, -8.62817084e+00, -1.74836105e+00, -4.88300213e-01, -1.21747799e+01, -8.34630802e+00, -8.36552370e-01, 3.07922486e+00, -4.15961535e-01, -9.81739294e+00, 2.88933650e+00, -1.87511018e+00, 6.19210169e-01, 5.08677288e+00, 5.65144983e-01, -1.28725738e+00, -7.45479667e-01, 2.90270640e+00, 4.01803539e+00, -3.39121202e-01, -3.52483690e+00, 1.89502028e+00, 2.09040360e+00, -2.80603999e+00, 1.45801414e+00, 2.74467226e+00, -2.72740300e-01, 3.91373218e+00, -1.05007367e+01, -8.75816544e+00, 
-9.48216559e+00, 4.90551479e+00, -1.15211263e-01, -8.30225088e+00, -8.66454079e+00, -3.19918761e-01, -2.10963123e+00, -7.19473257e-02, -1.92546452e+00, -2.92469111e+00, -2.90664753e+00, 1.61854360e+00, 1.05261753e+00, 3.39585894e+00, 1.08927851e+00, -7.60625341e+00, -7.72653016e-01, 7.88487590e-01, -8.37853170e-01, -7.30021881e-01, 1.11746530e-01, 7.65227534e-02, 4.20540979e+00, -9.50202446e+00, -8.78006742e+00, 3.50775812e+00, 3.95479461e+00, -9.05224646e-01, 2.27830898e+00, 7.89005756e-01, 3.23608770e+00, -1.11759409e+01, -6.98588694e-01, -1.04953315e+01, -1.02817702e+01, 6.78603005e-01, -2.02407808e+00, -9.17624916e-01, 3.99191575e+00, -1.04334882e+00, 3.30709252e+00, 3.28538514e+00, 4.37872623e+00, -1.82947923e+00, -1.01908037e+01, 3.43857919e+00, -1.05304951e+01, 1.10594346e+00, 6.16219697e-01, -1.20859872e+00, 4.61164315e+00, -1.96557020e+00, 3.21718440e+00, 1.93975803e-01, 5.87148931e-03, 4.33614078e-01, -3.01723335e+00, 2.84271780e+00, -9.42853058e+00, -8.51802106e+00, 6.13631530e+00, 1.43136075e+00, -1.51854685e+00, 1.59909918e+00, -9.43762219e+00, -7.67430564e-01, 2.14392371e+00, 5.25950490e+00, 4.27545710e+00, 2.48708050e+00, 2.97157371e+00, 3.79877054e+00, 2.05772110e+00, -1.02169699e+01, 1.09049844e+00, -5.60643409e-01, -3.46772942e+00, -1.10001801e+01, 1.89855120e+00, 3.23659674e-01])
sns.scatterplot( x = data[0][:, 0], y = data[0][:, 1], hue = data[1], palette = 'Set2')
<AxesSubplot:>
from sklearn.cluster import KMeans
model = KMeans(n_clusters = 4)
model.fit(data[0])
KMeans(n_clusters=4)
model.labels_
array([2, 2, 2, 2, 3, 2, 0, 0, 3, 3, 1, 3, 3, 3, 3, 3, 2, 3, 2, 3, 2, 2, 1, 0, 1, 1, 3, 2, 3, 1, 1, 0, 1, 2, 2, 3, 1, 1, 2, 3, 2, 2, 1, 0, 1, 0, 1, 3, 1, 2, 2, 1, 2, 1, 2, 1, 1, 3, 0, 0, 0, 2, 1, 0, 2, 0, 3, 2, 0, 2, 0, 3, 0, 2, 0, 2, 1, 0, 0, 2, 0, 1, 2, 2, 1, 0, 3, 1, 3, 1, 0, 2, 3, 1, 2, 2, 1, 0, 3, 2, 0, 3, 1, 0, 3, 3, 1, 1, 0, 3, 3, 1, 0, 3, 0, 1, 3, 0, 2, 2, 2, 0, 1, 2, 2, 1, 3, 1, 3, 3, 3, 1, 1, 0, 1, 2, 3, 1, 1, 3, 1, 3, 0, 2, 2, 0, 0, 3, 0, 3, 0, 2, 1, 2, 2, 1, 3, 3, 0, 3, 0, 0, 0, 3, 2, 0, 2, 1, 1, 3, 0, 3, 0, 3, 1, 3, 3, 0, 2, 2, 0, 0, 3, 1, 2, 1, 0, 0, 0, 1, 0, 0, 1, 2, 1, 3, 3, 2, 0, 1], dtype=int32)
data[1]
array([2, 2, 2, 2, 1, 2, 3, 3, 1, 1, 0, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 2, 0, 3, 0, 0, 1, 2, 1, 0, 0, 3, 0, 2, 2, 1, 0, 0, 2, 1, 2, 2, 0, 3, 0, 3, 0, 1, 0, 2, 2, 0, 2, 0, 2, 0, 0, 1, 3, 3, 3, 2, 0, 3, 2, 3, 1, 2, 3, 2, 3, 1, 3, 2, 3, 2, 0, 3, 3, 2, 3, 0, 2, 2, 0, 3, 1, 0, 1, 0, 3, 2, 1, 0, 2, 2, 0, 3, 1, 2, 3, 1, 0, 3, 1, 1, 0, 0, 3, 1, 1, 0, 3, 1, 3, 0, 1, 3, 2, 2, 2, 3, 0, 2, 2, 0, 1, 0, 1, 1, 1, 0, 0, 3, 0, 2, 1, 0, 0, 1, 0, 1, 3, 2, 2, 3, 3, 1, 3, 1, 3, 2, 0, 2, 2, 0, 1, 1, 3, 1, 3, 3, 3, 1, 2, 3, 2, 0, 0, 1, 3, 1, 3, 1, 0, 1, 1, 3, 2, 2, 3, 3, 1, 0, 2, 0, 3, 3, 3, 0, 3, 3, 0, 2, 0, 1, 1, 2, 3, 0])
답안지
sns.scatterplot( x = data[0][:, 0], y = data[0][:, 1], hue = data[1], palette = 'Set2')
<AxesSubplot:>
sns.scatterplot( x = data[0][:, 0], y = data[0][:, 1], hue = model.labels_ , palette = 'Set2')
<AxesSubplot:>
n_clusters 값을 3으로 조정하여 3그룹으로 만들기
model = KMeans(n_clusters = 3)
model.fit(data[0])
KMeans(n_clusters=3)
sns.scatterplot( x = data[0][:, 0], y = data[0][:, 1], hue = model.labels_ , palette = 'Set2')
<AxesSubplot:>
5 그룹으로 만들기
model = KMeans(n_clusters = 5)
model.fit(data[0])
sns.scatterplot( x = data[0][:, 0], y = data[0][:, 1], hue = model.labels_ , palette = 'Set2')
<AxesSubplot:>
model.inertia_
377.0923630431555
n_clusters 값에 따라 distance(inertia_, 각 데이터와 소속 클러스터 중심 사이 거리의 제곱합)가 어떻게 변하는지 확인하기. for 문을 통해 n_clusters를 2부터 10까지 바꿔 가며 계산한다.
distance = []
for i in range(2,11):
model = KMeans(n_clusters = i)
model.fit(data[0])
distance.append(model.inertia_)
distance
[3855.3908583820526, 967.5329197924485, 428.6910143243812, 377.0923630431555, 333.0919199257052, 302.838248252247, 271.9107332802928, 239.1770373968992, 214.00356496738334]
그래프를 그려서 가장 좋은 clusters 를 확인해 보자.
sns.lineplot(x = list(range(2,11)), y = distance)
<AxesSubplot:>
n_clusters가 4일 때까지는 distance가 급격히 줄어들고, 그 이후로는 감소폭이 완만해진다. 따라서 4일 때가 최적의 지점이다. 이렇게 꺾이는 지점(팔꿈치)을 찾는 그래프를 Elbow Plot이라고 한다.
이제 실전 예제를 다루어 보자.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
data를 불러오자. 그리고 CustomerID는 분석에 중요하지 않은 데이터이므로 index_col을 사용해 인덱스로 지정하자.
data = pd.read_csv('./data/Mall_Customers.csv', index_col = 0)
data.head()
Gender | Age | Annual Income (k$) | Spending Score (1-100) | |
---|---|---|---|---|
CustomerID | ||||
1 | Male | 19 | 15 | 39 |
2 | Male | 21 | 15 | 81 |
3 | Female | 20 | 16 | 6 |
4 | Female | 23 | 16 | 77 |
5 | Female | 31 | 17 | 40 |
data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 200 entries, 1 to 200 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Gender 200 non-null object 1 Age 200 non-null int64 2 Annual Income (k$) 200 non-null int64 3 Spending Score (1-100) 200 non-null int64 dtypes: int64(3), object(1) memory usage: 7.8+ KB
data.describe()
Age | Annual Income (k$) | Spending Score (1-100) | |
---|---|---|---|
count | 200.000000 | 200.000000 | 200.000000 |
mean | 38.850000 | 60.560000 | 50.200000 |
std | 13.969007 | 26.264721 | 25.823522 |
min | 18.000000 | 15.000000 | 1.000000 |
25% | 28.750000 | 41.500000 | 34.750000 |
50% | 36.000000 | 61.500000 | 50.000000 |
75% | 49.000000 | 78.000000 | 73.000000 |
max | 70.000000 | 137.000000 | 99.000000 |
결측치는 없다.
하지만 Gender 컬럼은 텍스트 데이터이므로, pd.get_dummies 함수를 사용해서 숫자 데이터(더미 변수)로 바꿔주자.
pd.get_dummies(data, columns=['Gender'], drop_first = True)
Age | Annual Income (k$) | Spending Score (1-100) | Gender_Male | |
---|---|---|---|---|
CustomerID | ||||
1 | 19 | 15 | 39 | 1 |
2 | 21 | 15 | 81 | 1 |
3 | 20 | 16 | 6 | 0 |
4 | 23 | 16 | 77 | 0 |
5 | 31 | 17 | 40 | 0 |
... | ... | ... | ... | ... |
196 | 35 | 120 | 79 | 0 |
197 | 45 | 126 | 28 | 0 |
198 | 32 | 126 | 74 | 1 |
199 | 32 | 137 | 18 | 1 |
200 | 30 | 137 | 83 | 1 |
200 rows × 4 columns
data = pd.get_dummies(data, columns=['Gender'], drop_first = True)
data
Age | Annual Income (k$) | Spending Score (1-100) | Gender_Male | |
---|---|---|---|---|
CustomerID | ||||
1 | 19 | 15 | 39 | 1 |
2 | 21 | 15 | 81 | 1 |
3 | 20 | 16 | 6 | 0 |
4 | 23 | 16 | 77 | 0 |
5 | 31 | 17 | 40 | 0 |
... | ... | ... | ... | ... |
196 | 35 | 120 | 79 | 0 |
197 | 45 | 126 | 28 | 0 |
198 | 32 | 126 | 74 | 1 |
199 | 32 | 137 | 18 | 1 |
200 | 30 | 137 | 83 | 1 |
200 rows × 4 columns
이제 data가 잘 정리 되었다. KMeans 를 실행해보자.
from sklearn.cluster import KMeans
model = KMeans(n_clusters = 3)
model.fit(data)
KMeans(n_clusters=3)
model.labels_
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2], dtype=int32)
data를 result_df 로 하나 복사 해둔다.
result_df = data.copy()
result_df
Age | Annual Income (k$) | Spending Score (1-100) | Gender_Male | |
---|---|---|---|---|
CustomerID | ||||
1 | 19 | 15 | 39 | 1 |
2 | 21 | 15 | 81 | 1 |
3 | 20 | 16 | 6 | 0 |
4 | 23 | 16 | 77 | 0 |
5 | 31 | 17 | 40 | 0 |
... | ... | ... | ... | ... |
196 | 35 | 120 | 79 | 0 |
197 | 45 | 126 | 28 | 0 |
198 | 32 | 126 | 74 | 1 |
199 | 32 | 137 | 18 | 1 |
200 | 30 | 137 | 83 | 1 |
200 rows × 4 columns
result_df['label'] = model.labels_
result_df
Age | Annual Income (k$) | Spending Score (1-100) | Gender_Male | label | |
---|---|---|---|---|---|
CustomerID | |||||
1 | 19 | 15 | 39 | 1 | 0 |
2 | 21 | 15 | 81 | 1 | 0 |
3 | 20 | 16 | 6 | 0 | 0 |
4 | 23 | 16 | 77 | 0 | 0 |
5 | 31 | 17 | 40 | 0 | 0 |
... | ... | ... | ... | ... | ... |
196 | 35 | 120 | 79 | 0 | 2 |
197 | 45 | 126 | 28 | 0 | 1 |
198 | 32 | 126 | 74 | 1 | 2 |
199 | 32 | 137 | 18 | 1 | 1 |
200 | 30 | 137 | 83 | 1 | 2 |
200 rows × 5 columns
result_df.groupby('label').mean()
Age | Annual Income (k$) | Spending Score (1-100) | Gender_Male | |
---|---|---|---|---|
label | ||||
0 | 40.325203 | 44.154472 | 49.829268 | 0.406504 |
1 | 40.394737 | 87.000000 | 18.631579 | 0.526316 |
2 | 32.692308 | 86.538462 | 82.128205 | 0.461538 |
result_df['label'].value_counts()
0 123 2 39 1 38 Name: label, dtype: int64
distance = []
for i in range(2,11):
model = KMeans(n_clusters = i)
model.fit(data)
distance.append(model.inertia_)
distance
[212889.44245524294, 143391.59236035674, 104422.83498539752, 75528.58656469136, 58348.64136331504, 51165.184237107904, 44357.32664902663, 40891.680830865036, 37509.97941426389]
sns.lineplot( x=list(range(2,11)), y = distance)
<AxesSubplot:>
이번에는 Elbow Plot 으로 판단하기 어려운 그림이 나타났다. 그래서 다른 방법으로 최적의 k 값 찾기를 해보려 한다.
실루엣 스코어
from sklearn.metrics import silhouette_score
silhouette_score(data, model.labels_)
0.3787177701012805
sil = []
for i in range(2,11):
model = KMeans(n_clusters = i)
model.fit(data)
sil.append(silhouette_score(data, model.labels_))
sil
[0.29307334005502633, 0.383798873822341, 0.4052954330641215, 0.4440669204743008, 0.45205475380756527, 0.4409411333609709, 0.4304396746257457, 0.41642911077666755, 0.40064258067585135]
sns.lineplot(x = list(range(2,11)), y = sil)
<AxesSubplot:>
silhouette_score 는 점수가 클수록(1에 가까울수록) 군집이 잘 나뉘었다는 뜻이다. 그리고 클러스터 수(k)를 무작정 늘린다고 해서 점수가 개선되지는 않는다.
- 따라서 클러스터 수(k)가 6일 때 가장 좋은 수치가 나타남을 확인했다.
model = KMeans(n_clusters = 6)
model.fit(data)
KMeans(n_clusters=6)
data
Age | Annual Income (k$) | Spending Score (1-100) | Gender_Male | |
---|---|---|---|---|
CustomerID | ||||
1 | 19 | 15 | 39 | 1 |
2 | 21 | 15 | 81 | 1 |
3 | 20 | 16 | 6 | 0 |
4 | 23 | 16 | 77 | 0 |
5 | 31 | 17 | 40 | 0 |
... | ... | ... | ... | ... |
196 | 35 | 120 | 79 | 0 |
197 | 45 | 126 | 28 | 0 |
198 | 32 | 126 | 74 | 1 |
199 | 32 | 137 | 18 | 1 |
200 | 30 | 137 | 83 | 1 |
200 rows × 4 columns
data['label'] = model.labels_
data
Age | Annual Income (k$) | Spending Score (1-100) | Gender_Male | label | |
---|---|---|---|---|---|
CustomerID | |||||
1 | 19 | 15 | 39 | 1 | 2 |
2 | 21 | 15 | 81 | 1 | 4 |
3 | 20 | 16 | 6 | 0 | 2 |
4 | 23 | 16 | 77 | 0 | 4 |
5 | 31 | 17 | 40 | 0 | 2 |
... | ... | ... | ... | ... | ... |
196 | 35 | 120 | 79 | 0 | 3 |
197 | 45 | 126 | 28 | 0 | 1 |
198 | 32 | 126 | 74 | 1 | 3 |
199 | 32 | 137 | 18 | 1 | 1 |
200 | 30 | 137 | 83 | 1 | 3 |
200 rows × 5 columns
data.groupby('label').mean()
Age | Annual Income (k$) | Spending Score (1-100) | Gender_Male | |
---|---|---|---|---|
label | ||||
0 | 27.000000 | 56.657895 | 49.131579 | 0.342105 |
1 | 41.685714 | 88.228571 | 17.285714 | 0.571429 |
2 | 44.142857 | 25.142857 | 19.523810 | 0.380952 |
3 | 32.692308 | 86.538462 | 82.128205 | 0.461538 |
4 | 25.272727 | 25.727273 | 79.363636 | 0.409091 |
5 | 56.155556 | 53.377778 | 49.088889 | 0.444444 |
sns.boxplot(x = 'label', y = 'Age', data = data)
<AxesSubplot:xlabel='label', ylabel='Age'>
sns.boxplot(x = 'label', y = 'Annual Income (k$)', data = data)
<AxesSubplot:xlabel='label', ylabel='Annual Income (k$)'>
sns.boxplot(x = 'label', y = 'Spending Score (1-100)', data = data)
<AxesSubplot:xlabel='label', ylabel='Spending Score (1-100)'>
- 여기서 상식 밖의 흥미로운 결과가 하나 보인다.
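- 0 번 레이블과 3 번 레이블을 보면, 연령은 비슷하다. 연간 소득은 0 번 레이블이 3 번 레이블보다 높다. 그래서 구매력 또한 같은 경향일 것으로 예상했으나 반대의 경향이 나타났다. (단, KMeans 는 random_state 를 고정하지 않으면 실행할 때마다 레이블 번호가 달라지므로, 여기서 말하는 레이블 번호는 위 groupby 표에 출력된 번호와 일치하지 않을 수 있다.)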
PCA 주성분 분석¶
data
Age | Annual Income (k$) | Spending Score (1-100) | Gender_Male | label | |
---|---|---|---|---|---|
CustomerID | |||||
1 | 19 | 15 | 39 | 1 | 2 |
2 | 21 | 15 | 81 | 1 | 4 |
3 | 20 | 16 | 6 | 0 | 2 |
4 | 23 | 16 | 77 | 0 | 4 |
5 | 31 | 17 | 40 | 0 | 2 |
... | ... | ... | ... | ... | ... |
196 | 35 | 120 | 79 | 0 | 3 |
197 | 45 | 126 | 28 | 0 | 1 |
198 | 32 | 126 | 74 | 1 | 3 |
199 | 32 | 137 | 18 | 1 | 1 |
200 | 30 | 137 | 83 | 1 | 3 |
200 rows × 5 columns
label 컬럼은 PCA 결과에 영향을 줄 수 있으므로, 드랍시킨다.
data.drop('label', axis = 1, inplace = True)
data
Age | Annual Income (k$) | Spending Score (1-100) | Gender_Male | |
---|---|---|---|---|
CustomerID | ||||
1 | 19 | 15 | 39 | 1 |
2 | 21 | 15 | 81 | 1 |
3 | 20 | 16 | 6 | 0 |
4 | 23 | 16 | 77 | 0 |
5 | 31 | 17 | 40 | 0 |
... | ... | ... | ... | ... |
196 | 35 | 120 | 79 | 0 |
197 | 45 | 126 | 28 | 0 |
198 | 32 | 126 | 74 | 1 |
199 | 32 | 137 | 18 | 1 |
200 | 30 | 137 | 83 | 1 |
200 rows × 4 columns
from sklearn.decomposition import PCA
pca = PCA(n_components = 2) # 컬럼을 2개로
pca.fit(data)
PCA(n_components=2)
pca.transform(data)
array([[-3.18699448e+01, -3.30012521e+01], [ 7.64494048e-01, -5.68429006e+01], [-5.74082757e+01, -1.31249607e+01], [-2.16854252e+00, -5.34785900e+01], [-3.21740846e+01, -3.03884119e+01], [-2.17695183e+00, -5.22272685e+01], [-5.90656895e+01, -9.54376302e+00], [ 1.23708622e+01, -6.16180208e+01], [-6.63157690e+01, -3.21423160e+00], [-5.65556212e+00, -4.72672225e+01], [-5.82365979e+01, -9.13418751e+00], [ 1.46218043e+01, -6.21075685e+01], [-5.51608374e+01, -1.00799251e+01], [-3.02878331e-03, -5.01140754e+01], [-5.27646657e+01, -1.16807468e+01], [ 1.94661180e+00, -5.15222086e+01], [-3.45055496e+01, -2.37615402e+01], [-7.30493262e+00, -4.35151138e+01], [-4.12573070e+01, -1.64734737e+01], [ 1.61902860e+01, -5.83001207e+01], [-3.27400141e+01, -2.13347966e+01], [-9.81885245e-01, -4.44524038e+01], [-5.78100898e+01, -1.87057486e+00], [-1.52712624e+00, -4.28582471e+01], [-5.04819556e+01, -3.56276971e+00], [ 7.69078091e+00, -4.58600560e+01], [-3.46329913e+01, -1.50715244e+01], [-9.94927953e+00, -3.30225018e+01], [-3.38854915e+01, -1.43440485e+01], [ 1.33436470e+01, -4.87084876e+01], [-5.82990002e+01, 4.58029961e+00], [ 3.30609318e+00, -4.01274983e+01], [-5.52102909e+01, 6.08873470e+00], [ 2.05727386e+01, -4.89976354e+01], [-4.65939553e+01, -1.75635193e-01], [ 1.13600364e+01, -4.22934898e+01], [-4.23244634e+01, -2.00566039e+00], [ 3.95978859e+00, -3.57152924e+01], [-3.23506471e+01, -5.53131490e+00], [ 9.18744850e+00, -3.57475306e+01], [-3.01682815e+01, -6.09009687e+00], [ 2.23819886e+01, -4.41699255e+01], [-2.55813682e+01, -8.08036828e+00], [-2.71818732e+00, -2.46556548e+01], [-3.20580877e+01, -3.35982914e+00], [ 1.74869118e+00, -2.78679628e+01], [-1.04361893e+01, -1.79156325e+01], [-1.23777865e+01, -1.63365883e+01], [-1.66857879e+01, -1.32051927e+01], [-1.70637425e+01, -1.29432699e+01], [-1.14279946e+01, -1.47081336e+01], [-2.11661573e+00, -2.13930938e+01], [-5.86576186e+00, -1.74048397e+01], [-6.44140255e+00, -1.71797090e+01], [-1.65304147e+01, -9.75152197e+00], 
[-1.91078530e+01, -7.84724924e+00], [-1.22007224e+01, -1.16816450e+01], [-1.87466838e+01, -7.02718282e+00], [-5.70201253e+00, -1.37818367e+01], [-1.45458017e+01, -7.50578867e+00], [-9.89832195e+00, -1.10183904e+01], [-1.04648946e+00, -1.71235274e+01], [-1.18864721e+01, -8.30888676e+00], [-3.92770171e+00, -1.40286469e+01], [-1.13282822e+01, -7.44887080e+00], [ 3.46377080e+00, -1.79332904e+01], [-8.33441304e+00, -9.49578304e+00], [-1.46308641e+01, -5.07395882e+00], [ 3.27479351e+00, -1.78023290e+01], [-8.61369094e+00, -9.21467482e+00], [-8.91846305e+00, -8.01933089e+00], [-1.47897744e+01, -3.57239266e+00], [-1.11557908e+01, -5.07876799e+00], [-5.65372529e+00, -9.09602989e+00], [-1.01846740e+01, -8.26808341e-01], [ 1.55364218e+00, -9.16579658e+00], [-2.82260299e+00, -6.10521405e+00], [-5.80809610e+00, -3.88896958e+00], [ 5.48888047e-01, -8.41247039e+00], [-1.22246152e+01, 7.31471686e-01], [-6.66268195e+00, -3.38430938e+00], [ 7.19240500e-02, -8.16815429e+00], [-1.64125485e+01, 3.66425023e+00], [-1.00856646e+01, -8.09201640e-01], [ 4.85688946e+00, -1.15438660e+01], [-8.89193316e+00, -1.69348923e+00], [ 9.83539194e-01, -5.23990787e+00], [ 4.86176170e+00, -7.83995054e+00], [ 7.11270380e+00, -8.32949817e+00], [-6.91506399e+00, 1.80040810e+00], [-2.65394818e+00, -1.98949328e-01], [-4.20954748e+00, 1.28908296e+00], [-3.00216853e+00, 1.43515694e+00], [-8.56410176e+00, 5.55093801e+00], [-5.48026470e+00, 3.35545766e+00], [ 3.89131456e+00, -3.42960043e+00], [-4.38487724e+00, 2.45040594e+00], [ 1.75269668e+00, -1.89050582e+00], [-7.91561133e+00, 6.26080715e+00], [ 2.87781838e+00, -1.42337407e+00], [-3.38823294e+00, 4.36747619e+00], [-2.79957694e+00, 3.75521081e+00], [ 2.44460212e+00, -1.98745323e-01], [ 7.04863359e+00, -3.27258468e+00], [ 3.48816526e+00, -8.34366932e-01], [-2.22426899e+00, 3.53165883e+00], [-3.85154940e+00, 5.64215388e+00], [-4.72819210e+00, 6.36777396e+00], [-9.73190228e+00, 9.92291729e+00], [-5.42390087e+00, 6.79152170e+00], [-2.09088611e+00, 4.36498207e+00], 
[ 8.17442075e+00, -2.80861035e+00], [-4.25963743e+00, 7.37477932e+00], [ 2.47463584e+00, 2.59251312e+00], [ 4.82458735e+00, 2.12057220e+00], [ 6.20762879e+00, 1.10374449e+00], [-7.60943758e+00, 1.08833082e+01], [ 7.61239435e+00, -1.32464390e-01], [-4.16446458e+00, 1.09285479e+01], [ 7.02864384e+00, 2.76306274e+00], [ 1.05887794e+01, 3.26423711e-01], [-4.06578789e+00, 1.09477334e+01], [ 1.08816716e+01, 2.49633076e+00], [ 3.70086253e+01, -1.65715723e+01], [-8.11136329e+00, 1.77213173e+01], [ 2.81052680e+01, -8.77793015e+00], [-6.58656288e+00, 1.77071451e+01], [ 4.11409310e+01, -1.71194125e+01], [-2.84744243e+01, 3.35759969e+01], [ 2.57986982e+01, -5.90344419e+00], [-2.77787156e+01, 3.31522492e+01], [ 2.56097209e+01, -5.77248278e+00], [-3.38202552e+00, 1.67305440e+01], [ 2.45664573e+01, -3.71778619e+00], [-2.46431207e+01, 3.35286461e+01], [ 3.88951938e+01, -1.29275282e+01], [-2.76062242e+01, 3.55223520e+01], [ 2.65377888e+01, -3.92464688e+00], [-1.99354738e+01, 3.13366002e+01], [ 2.57738030e+01, -2.15105852e+00], [-3.04577022e+01, 3.99894160e+01], [ 4.34352216e+01, -1.37857614e+01], [ 3.12158982e+00, 1.69136141e+01], [ 3.93081209e+01, -9.53558446e+00], [-1.77314505e+01, 3.33997445e+01], [ 4.85124137e+01, -1.49884086e+01], [-3.21370344e+00, 2.26383874e+01], [ 2.96786219e+01, -1.26656692e+00], [-1.09831970e+01, 2.96462611e+01], [ 4.24651072e+01, -9.37699000e+00], [-1.66143722e+01, 3.36959653e+01], [ 3.99482020e+01, -7.57439386e+00], [-1.44449887e+01, 3.21036643e+01], [ 3.07053996e+01, -8.20199299e-01], [-1.81559580e+01, 3.47921267e+01], [ 4.30022716e+01, -9.72140398e+00], [-2.80566583e+01, 4.20925098e+01], [ 3.37892367e+01, -3.01567965e+00], [-2.74897264e+01, 4.16996255e+01], [ 2.98591898e+01, -1.46206864e-01], [-4.33395289e+00, 2.58751711e+01], [ 3.84968836e+01, -5.20772553e+00], [-1.97451613e+01, 3.98647912e+01], [ 4.71562681e+01, -9.06797167e+00], [-6.74276964e+00, 3.51063622e+01], [ 3.44177041e+01, 5.15049060e+00], [-9.35838475e+00, 3.83104267e+01], [ 
5.12934461e+01, -5.91189643e+00], [-2.13349996e+00, 3.43142059e+01], [ 2.69184137e+01, 1.31317350e+01], [-1.38938730e+01, 4.28741541e+01], [ 3.71064352e+01, 5.72115470e+00], [-1.54959919e+01, 4.40719921e+01], [ 4.89567761e+01, -2.98736153e+00], [-1.55726450e+01, 4.52525005e+01], [ 4.59635392e+01, 4.77046978e-01], [-1.51348227e+01, 4.48920585e+01], [ 3.31679791e+01, 9.84194894e+00], [-1.31666956e+01, 4.96388560e+01], [ 5.11054712e+01, 2.87979602e+00], [ 7.49379714e+00, 3.96595776e+01], [ 5.08831894e+01, 8.01446456e+00], [-6.98086768e+00, 5.14044048e+01], [ 5.36107627e+01, 7.28217954e+00], [ 1.34171991e+01, 3.77829380e+01], [ 6.10841598e+01, 3.05805695e+00], [ 3.47599286e-01, 4.97106312e+01], [ 3.98450882e+01, 2.10558529e+01], [-1.52051598e+00, 5.36421714e+01], [ 5.28730074e+01, 1.39625346e+01], [ 4.51838126e+00, 4.92820743e+01], [ 4.10527667e+01, 2.26210019e+01], [-1.19688712e+00, 6.58449930e+01], [ 6.30973366e+01, 1.88649731e+01], [ 6.56619775e+00, 6.87444357e+01], [ 5.83525153e+01, 3.10175417e+01], [ 1.99080013e+01, 6.64461080e+01], [ 5.85208042e+01, 3.83460389e+01], [ 2.09791300e+01, 7.93764054e+01], [ 7.24476934e+01, 4.18113364e+01]])
pca_df = pca.transform(data)
넘파이 어레이로 나온 것을 데이터 프레임으로 보기좋게 바꾸자.
pd.DataFrame(pca_df)
0 | 1 | |
---|---|---|
0 | -31.869945 | -33.001252 |
1 | 0.764494 | -56.842901 |
2 | -57.408276 | -13.124961 |
3 | -2.168543 | -53.478590 |
4 | -32.174085 | -30.388412 |
... | ... | ... |
195 | 58.352515 | 31.017542 |
196 | 19.908001 | 66.446108 |
197 | 58.520804 | 38.346039 |
198 | 20.979130 | 79.376405 |
199 | 72.447693 | 41.811336 |
200 rows × 2 columns
컬럼이름을 넣어주자.
pd.DataFrame(pca_df, columns=['PC1', 'PC2'])
PC1 | PC2 | |
---|---|---|
0 | -31.869945 | -33.001252 |
1 | 0.764494 | -56.842901 |
2 | -57.408276 | -13.124961 |
3 | -2.168543 | -53.478590 |
4 | -32.174085 | -30.388412 |
... | ... | ... |
195 | 58.352515 | 31.017542 |
196 | 19.908001 | 66.446108 |
197 | 58.520804 | 38.346039 |
198 | 20.979130 | 79.376405 |
199 | 72.447693 | 41.811336 |
200 rows × 2 columns
pca_df = pd.DataFrame(pca_df, columns=['PC1', 'PC2'])
pca_df
PC1 | PC2 | |
---|---|---|
0 | -31.869945 | -33.001252 |
1 | 0.764494 | -56.842901 |
2 | -57.408276 | -13.124961 |
3 | -2.168543 | -53.478590 |
4 | -32.174085 | -30.388412 |
... | ... | ... |
195 | 58.352515 | 31.017542 |
196 | 19.908001 | 66.446108 |
197 | 58.520804 | 38.346039 |
198 | 20.979130 | 79.376405 |
199 | 72.447693 | 41.811336 |
200 rows × 2 columns
컬럼을 PCA 를 통해 2개로 줄였으니, scatter 로 그래프를 그려보자. 컬럼이 4개에서 2개로 줄었으므로 원래 데이터보다는 정보량이 줄었을 것이다.
plt.figure(figsize=(20,10))
sns.scatterplot(x = pca_df['PC1'], y = pca_df['PC2'], hue = model.labels_, palette = 'Set2', s = 100)
<AxesSubplot:xlabel='PC1', ylabel='PC2'>
PCA 로 전처리한 것을 시각화로 살펴보았다. 하지만 차원을 줄이는 과정에서 정보가 왜곡되었기 때문에 좋은 모델이라고는 볼 수 없다. 참고용으로만 보자.
- KMeans Cluster 에 대해 알아보았고, 최적의 k 값을 구하는 방법도 알아보았다. 그렇게해서 나온 결과를 어떤식으로 해석할 수 있는지 알아보았다.
- KMeans Clustering 을 하면, 데이터 기반으로 특색이 뚜렷하고 서로 비슷한 무리들끼리 묶이기 때문에 좀 더 많은 인사이트를 얻을 수 있다.
- 출처 : 파이썬을 활용한 이커머스 데이터 분석
'파이썬을 활용한 이커머스 데이터 분석' 카테고리의 다른 글
Chapter09.NLP(상품리뷰분석) (0) | 2021.07.11 |
---|---|
Chapter08.Times Series 쇼핑몰 매출 예측 (Times Series) (0) | 2021.07.08 |
Chapter.06 프로모션 효율 예측 (Random Forest) (0) | 2021.06.13 |
Chapter05.구매 요인 분석(Dicision Tree) (0) | 2021.06.12 |
Chapter04.KNN (0) | 2021.06.10 |