from sklearn.cluster import KMeans from scipy.sparse import csr_matrix, coo_matrix from sklearn.externals import joblib import datetime dir = "C:/Users/v-zhiplu/Desktop/pace/" def convert(): lines = open(dir + "kmeans/cate_center_with5797.tsv"); data = [] row = [] col = [] i = 0 for line in lines: line = line[:-1] parts = line.split("\t") if len(parts) == 2: parts1 = parts[1].split(" ") for tmp in parts1: parts2 = tmp.split(":") if len(parts2) == 2: row.append(i) col.append(int(parts2[0])) data.append(float(parts2[1])) i += 1 coo = coo_matrix((data,(row,col))) return csr_matrix(coo) def kmeans(n): feat = convert() print "start clustering..." d1 = datetime.datetime.now() clf = KMeans(n_clusters=n) clf.fit(feat) d2 = datetime.datetime.now() joblib.dump(clf , dir + "kmeans/km"+str(n)+".pkl") //save model print "run times(s): ", (d2-d1).seconds if __name__=='__main__': kmeans(20)
# 本文固定链接: http://www.luozhipeng.com/?p=498
# 转载请注明: luozhipeng 2015-11-30 于 罗志鹏的BLOG 发表