使用sklearn做kmeans聚类分析-[使用sparse matrix]

作者:luozhipeng   发表日期:2015-11-30  浏览:2,753次


from sklearn.cluster import KMeans
from scipy.sparse import csr_matrix, coo_matrix
from sklearn.externals import joblib
import datetime

# Base directory that the functions below prefix to their input/output paths
# ("kmeans/..." is appended to it). NOTE(review): this name shadows the
# builtin dir(); it is kept because the functions in this file reference it.
dir = "C:/Users/v-zhiplu/Desktop/pace/"

def convert():
    """Load the sparse feature file and return it as a CSR matrix.

    The file ``dir + "kmeans/cate_center_with5797.tsv"`` is read line by
    line. A line contributes features only when it has exactly two
    tab-separated fields; the second field is a space-separated list of
    ``column:value`` tokens. Tokens that do not split into exactly two
    parts on ":" are ignored.

    Every line of the file (including lines without usable features)
    occupies one row, so row ``k`` of the result always corresponds to
    line ``k`` of the file.

    Returns:
        scipy.sparse.csr_matrix with one row per input line and
        ``max(column index) + 1`` columns.

    Raises:
        ValueError: if a column index or value cannot be parsed as
            int / float respectively.
    """
    data = []
    row = []
    col = []
    n_rows = 0

    # The context manager guarantees the file handle is released even if
    # parsing raises.
    with open(dir + "kmeans/cate_center_with5797.tsv") as lines:
        for i, line in enumerate(lines):
            n_rows = i + 1
            # Strip only the line terminator; slicing off the last character
            # unconditionally would eat real data when the final line has
            # no trailing newline.
            parts = line.rstrip("\r\n").split("\t")
            if len(parts) != 2:
                continue
            for tmp in parts[1].split(" "):
                parts2 = tmp.split(":")
                if len(parts2) == 2:
                    row.append(i)
                    col.append(int(parts2[0]))
                    data.append(float(parts2[1]))

    # Pass the shape explicitly: when it is inferred from the indices,
    # trailing lines without features would be dropped and the row count
    # would no longer match the number of input lines.
    n_cols = max(col) + 1 if col else 0
    coo = coo_matrix((data, (row, col)), shape=(n_rows, n_cols))
    return csr_matrix(coo)
    

def kmeans(n):
    """Fit a KMeans model with ``n`` clusters and save it to disk.

    The feature matrix is obtained from ``convert()``. Only the call to
    ``fit`` is timed; loading the data and dumping the model are outside
    the measured interval. The fitted model is written with joblib to
    ``dir + "kmeans/km<n>.pkl"`` and the elapsed whole seconds are printed.

    Args:
        n: number of clusters, passed to ``KMeans(n_clusters=n)``.
    """
    feat = convert()
    # Single-argument print(...) is valid under both Python 2 and Python 3.
    print("start clustering...")
    d1 = datetime.datetime.now()
    clf = KMeans(n_clusters=n)
    clf.fit(feat)
    d2 = datetime.datetime.now()

    # save model
    joblib.dump(clf, dir + "kmeans/km" + str(n) + ".pkl")
    # total_seconds() includes the days component, which `.seconds` drops,
    # so runs longer than 24 hours are reported correctly. The two spaces
    # after the colon reproduce the original print-statement output.
    print("run times(s):  %d" % int((d2 - d1).total_seconds()))

# Script entry point: when run directly, cluster into 20 groups.
if __name__ == "__main__":
    kmeans(20)

标签:

本文固定链接: http://www.luozhipeng.com/?p=498
转载请注明: luozhipeng 2015-11-30 于 罗志鹏的BLOG 发表

上一篇 | 下一篇
返回顶部