PCA + kmeans

先记录一下PCA实战需要用到的安装包(arch下,python2环境)

python2-scikit-learn

python2-numpy

python2-pandas

python2-matplotlib

python2-seaborn

pandas.DataFrame

pandas 数据结构介绍

几个和科学计算、数据分析有关的重要的 Python 库:Numpy、Matplotlib、pandas

(之前数字图像处理课程都接触过了orz)

其中matplotlib 主要用于图像绘制

sklearn 是用于机器学习的python 模块

Seaborn也是用于图像绘制

str.format() 是 Python 2.6+ 引入的语法(Python 3 中同样可用)

format中的变量会按照str中{} 出现的顺序替换

# PCA demo on MNIST: load the dataset, project it onto 3 principal
# components, and plot the fraction of variance each component explains.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# fetch_mldata was removed in scikit-learn 0.20 and mldata.org is offline;
# fetch_openml('mnist_784') is the supported replacement.
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
import seaborn as sns

mnist = fetch_openml('mnist_784', as_frame=False)
X = mnist.data / 255.0  # scale pixel values from [0, 255] into [0, 1]
y = mnist.target

# One named column per pixel so the DataFrame is self-describing.
feat_cols = ['pixel' + str(i) for i in range(X.shape[1])]
df = pd.DataFrame(X, columns=feat_cols)
df['label'] = y
df['label'] = df['label'].apply(str)  # keep labels as strings for categorical plotting
X, y = None, None  # drop the raw arrays; the DataFrame holds its own copy
# print('Size of the dataframe: {}'.format(df.shape))

# (Optional) preview 30 random digits as 28x28 images:
# rndperm = np.random.permutation(df.shape[0])
# plt.gray()
# fig = plt.figure(figsize=(16, 7))
# for i in range(30):
#     ax = fig.add_subplot(3, 10, i + 1,
#                          title='Digit: ' + str(df.loc[rndperm[i], 'label']))
#     ax.matshow(df.loc[rndperm[i], feat_cols].values.reshape((28, 28)).astype(float))
# plt.show()

# Project onto the first three principal components.
n_components_pca = 3
pca = PCA(n_components=n_components_pca)
pca_result = pca.fit_transform(df[feat_cols].values)
df['pca-one'] = pca_result[:, 0]
df['pca-two'] = pca_result[:, 1]
df['pca-three'] = pca_result[:, 2]
print('Explained variation per principal component: {}'.format(pca.explained_variance_ratio_))

# Bar plot of the variance explained by each component.
pca_var_explained_df = pd.DataFrame({
    'principal component': np.arange(1, n_components_pca + 1),
    'variance_explained': pca.explained_variance_ratio_,
})
print(pca_var_explained_df.sum())
ax = sns.barplot(x='principal component',
                 y='variance_explained',
                 data=pca_var_explained_df,
                 palette="Set1")
ax.set_title('PCA - Variance explained')
plt.show()

PCA+kmeans 时间对比:

代码:

# Timing comparison: k-means on raw 120-dim random data vs. the same data
# reduced to 10 dimensions with PCA.
import time

import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

vec_num = 120
# 10000 samples, 120 features of uniform random noise.
data = np.random.rand(10000, vec_num)
# print(data)

# --- k-means on the raw data ---
# time.clock() was deprecated in 3.3 and removed in Python 3.8;
# perf_counter() is the recommended wall-clock timer for benchmarking.
t0 = time.perf_counter()
estimator = KMeans(n_clusters=4)  # build the clusterer
estimator.fit(data)               # run the clustering
print("kmeans time:", time.perf_counter() - t0)
inertia = estimator.inertia_  # within-cluster sum of squared distances
print("sum:", inertia / vec_num)  # normalized by the feature dimension

# --- k-means after PCA ---
com_num = 10
pca = PCA(n_components=com_num)
newdata = pca.fit_transform(data)
# NOTE: the PCA fit itself is deliberately excluded from the timing below.
t0 = time.perf_counter()
estimator.fit(newdata)
print("kmeans time after PCA:", time.perf_counter() - t0)
inertia = estimator.inertia_
print("sum after PCA", inertia / com_num)