testSet.txt
1.658985 4.285136
-3.453687 3.424321 4.838138 -1.151539 -5.379713 -3.362104 0.972564 2.924086 -3.567919 1.531611 0.450614 -3.302219 -3.487105 -1.724432 2.668759 1.594842 -3.156485 3.191137 3.165506 -3.999838 -2.786837 -3.099354 4.208187 2.984927 -2.123337 2.943366 0.704199 -0.479481 -0.392370 -3.963704 2.831667 1.574018 -0.790153 3.343144 2.943496 -3.357075 -3.195883 -2.283926 2.336445 2.875106 -1.786345 2.554248 2.190101 -1.906020 -3.403367 -2.778288 1.778124 3.880832 -1.688346 2.230267 2.592976 -2.054368 -4.007257 -3.207066 2.257734 3.387564 -2.679011 0.785119 0.939512 -4.023563 -3.674424 -2.261084 2.046259 2.735279 -3.189470 1.780269 4.372646 -0.822248 -2.579316 -3.497576 1.889034 5.190400 -0.798747 2.185588 2.836520 -2.658556 -3.837877 -3.253815
places.txt
Dolphin II 10860 SW Beaverton-Hillsdale Hwy Beaverton, OR 45.486502 -122.788346
Hotties 10140 SW Canyon Rd. Beaverton, OR 45.493150 -122.781021 Pussycats 8666a SW Canyon Road Beaverton, OR 45.498187 -122.766147 Stars Cabaret 4570 Lombard Ave Beaverton, OR 45.485943 -122.800311 Sunset Strip 10205 SW Park Way Beaverton, OR 45.508203 -122.781853 Vegas VIP Room 10018 SW Canyon Rd Beaverton, OR 45.493398 -122.779628 Full Moon Bar and Grill 28014 Southeast Wally Road Boring, OR 45.430319 -122.376304 505 Club 505 Burnside Rd Gresham, OR 45.507621 -122.425553 Dolphin 17180 McLoughlin Blvd Milwaukie, OR 45.399070 -122.618893 Dolphin III 13305 SE McLoughlin BLVD Milwaukie, OR 45.427072 -122.634159 Acropolis 8325 McLoughlin Blvd Portland, OR 45.462173 -122.638846 Blush 5145 SE McLoughlin Blvd Portland, OR 45.485396 -122.646587 Boom Boom Room 8345 Barbur Blvd Portland, OR 45.464826 -122.699212 Bottoms Up 16900 Saint Helens Rd Portland, OR 45.646831 -122.842918 Cabaret II 17544 Stark St Portland, OR 45.519142 -122.482480 Cabaret Lounge 503 W Burnside Portland, OR 45.523094 -122.675528 Carnaval 330 SW 3rd Avenue Portland, OR 45.520682 -122.674206 Casa Diablo 2839 NW St. 
Helens Road Portland, OR 45.543016 -122.720828 Chantilly Lace 6723 Killingsworth St Portland, OR 45.562715 -122.593078 Club 205 9939 Stark St Portland, OR 45.519052 -122.561510 Club Rouge 403 SW Stark Portland, OR 45.520561 -122.675605 Dancin’ Bare 8440 Interstate Ave Portland, OR 45.584124 -122.682725 Devil’s Point 5305 SE Foster Rd Portland, OR 45.495365 -122.608366 Double Dribble 13550 Southeast Powell Boulevard Portland, OR 45.497750 -122.524073 Dream on Saloon 15920 Stark St Portland, OR 45.519142 -122.499672 DV8 5003 Powell Blvd Portland, OR 45.497498 -122.611177 Exotica 240 Columbia Blvd Portland, OR 45.583048 -122.668350 Frolics 8845 Sandy Blvd Portland, OR 45.555384 -122.571475 G-Spot Airport 8654 Sandy Blvd Portland, OR 45.554263 -122.574167 G-Spot Northeast 3400 NE 82nd Ave Portland, OR 45.547229 -122.578746 G-Spot Southeast 5241 SE 72nd Ave Portland, OR 45.484823 -122.589208 Glimmers 3532 Powell Blvd Portland, OR 45.496918 -122.627920 Golden Dragon Exotic Club 324 SW 3rd Ave Portland, OR 45.520714 -122.674189 Heat 12131 SE Holgate Blvd. Portland, OR 45.489637 -122.538196 Honeysuckle’s Lingerie 3520 82nd Ave Portland, OR 45.548651 -122.578730 Hush Playhouse 13560 Powell Blvd Portland, OR 45.497765 -122.523985 JD’s Bar & Grill 4523 NE 60th Ave Portland, OR 45.555811 -122.600881 Jody’s Bar And Grill 12035 Glisan St Portland, OR 45.526306 -122.538833 Landing Strip 6210 Columbia Blvd Portland, OR 45.595042 -122.728825 Lucky Devil Lounge 633 SE Powell Blvd Portland, OR 45.501585 -122.659310
#-*- coding: utf-8 -*-
'''
Created on Feb 16, 2011
k Means Clustering for Ch10 of Machine Learning in Action
@author: Peter Harrington
'''
# The star import is kept on purpose: the rest of this module (and the
# fragments below) refer to NumPy names such as shape/zeros/nonzero unqualified.
from numpy import *


def loadDataSet(fileName):
    '''Read a tab-separated text file of numbers.

    Parameters:
        fileName: path of the file to read.

    Returns:
        A list with one entry per non-blank line; each entry is a list of
        floats obtained by splitting the line on tab characters.

    Raises:
        ValueError: if a field cannot be converted to float.
    '''
    dataMat = []
    # 'with' guarantees the handle is closed even if a conversion fails.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()  # drop leading/trailing whitespace
            if not stripped:
                # A blank line would otherwise become float('') -> ValueError.
                continue
            # Build a real list: on Python 3 a bare map() is a lazy iterator,
            # which cannot be turned into a numeric matrix by the callers.
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat


def distEclud(vecA, vecB):
    '''Return the Euclidean distance between two vectors.

    Parameters:
        vecA, vecB: NumPy vectors (or matrices) of the same shape.
    '''
    # Equivalent to la.norm(vecA - vecB).  The .sum() method is used so the
    # result does not depend on which ``sum`` name is in scope.
    return sqrt(power(vecA - vecB, 2).sum())


def randCent(dataSet, k):
    '''Create k random initial centroids inside the bounds of dataSet.

    Parameters:
        dataSet: 2-D NumPy matrix/array, one sample per row.
        k: number of centroids to create.

    Returns:
        A k x n matrix (n = number of columns of dataSet).  Column j of the
        result holds min_j + range_j * random.rand(k, 1), where min_j and
        range_j are the minimum and the (max - min) spread of column j.
    '''
    n = shape(dataSet)[1]  # feature dimension
    centroids = asmatrix(zeros((k, n)))
    for j in range(n):
        column = dataSet[:, j]
        minJ = float(column.min())
        rangeJ = float(column.max()) - minJ
        # k random values for the j-th coordinate
        centroids[:, j] = asmatrix(minJ + rangeJ * random.rand(k, 1))
    return centroids


def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    '''Cluster the rows of dataSet into k clusters with k-means.

    Parameters:
        dataSet: 2-D NumPy matrix, one sample per row.
        k: number of clusters.
        distMeas: callable(vecA, vecB) returning a distance
            (defaults to distEclud).
        createCent: callable(dataSet, k) returning the initial k x n
            centroid matrix (defaults to randCent).

    Returns:
        (centroids, clusterAssment) where centroids is the k x n centroid
        matrix and clusterAssment is an m x 2 matrix holding, for every
        sample, [cluster index, squared distance to that cluster's centroid].

    Side effects:
        Prints the current centroids once per iteration.
    '''
    m = shape(dataSet)[0]  # number of samples
    clusterAssment = asmatrix(zeros((m, 2)))
    # Step 1: initial centroids
    centroids = createCent(dataSet, k)
    clusterChanged = True
    # Iterate until no sample changes its cluster.
    while clusterChanged:
        clusterChanged = False
        # Step 2: assign every sample to its nearest centroid
        for i in range(m):
            minDist = inf
            minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        print('centroids=', centroids)
        # Step 3: move every centroid to the mean of its samples
        for cent in range(k):
            ptsInClust = dataSet[nonzero(clusterAssment[:, 0].A == cent)[0]]
            if ptsInClust.shape[0] == 0:
                # The mean of an empty cluster is NaN and would poison the
                # centroid for good; keep the previous position instead.
                continue
            centroids[cent, :] = mean(ptsInClust, axis=0)  # column-wise mean
    return centroids, clusterAssment


# NOTE(review): the definition of biKmeans (bisecting k-means) begins at this
# point in the source, but the remainder of it is garbled in the text that
# follows.  The surviving head is kept as a comment (its comments translated)
# rather than guessed at:
#   def biKmeans(dataSet, k, distMeas=distEclud):
#       m = shape(dataSet)[0]
#       clusterAssment = mat(zeros((m,2)))
#       # treat all samples as one cluster and take the mean
#       centroid0 = mean(dataSet, axis=0).tolist()[0]  # axis=0: column-wise, matrix->list
#       centList
# NOTE(review): everything between here and the ``__main__`` guard is residue
# of definitions whose source text was partially lost (the text between '<'
# and '>' characters appears to have been stripped from this copy).  The
# fragments cannot run as they stand, so they are preserved as comments, with
# their original comments translated, instead of being reconstructed by guess.
#
# -- tail of biKmeans (continues the assignment to centList) --
#   =[centroid0]  # create a list with one centroid
#   for j in range(m):  # compute the initial total error (SSE)
#       clusterAssment[j,1] = distMeas(mat(centroid0), dataSet[j,:])**2
#   # while the number of clusters ... [text lost]
#
# -- fragment of a sample-plotting routine (presumably datashow) --
#   ... len(marksamples):
#       print 'sorry,your k is too large,please add length of the marksample!'
#       return 1
#   # plot all samples
#   for i in range(num):
#       markindex=int(clusterAssment[i,0])  # matrix entry -> int cluster index
#       # x/y coordinates taken from the feature columns; marker style and size
#       plt.plot(dataSet[i,0],dataSet[i,1],marksamples[markindex],markersize=6)
#   # plot the centroids
#   markcentroids=['dr','db','dg','dk','^b','sk',' ... [text lost]
#
# -- the same fragment occurs a second time in the source --
#   ... len(marksamples):
#       print 'sorry,your k is too large,please add length of the marksample!'
#       return 1
#   # plot all samples
#   for i in range(num):
#       markindex=int(clusterAssment[i,0])  # matrix entry -> int cluster index
#       # x/y coordinates taken from the feature columns; marker style and size
#       plt.plot(dataSet[i,0],dataSet[i,1],marksamples[markindex],markersize=6)
#   # plot the centroids
#   markcentroids=['dr','db','dg','dk','^b','sk',' ... [text lost] ', '<']
#
# -- fragment of a routine that draws clusters over the image 'Portland.png' --
#   axprops = dict(xticks=[], yticks=[])
#   ax0=fig.add_axes(rect, label='ax0', **axprops)
#   imgP = plt.imread('Portland.png')
#   ax0.imshow(imgP)
#   ax1=fig.add_axes(rect, label='ax1', frameon=False)
#   for i in range(numClust):
#       ptsInCurrCluster = datMat[nonzero(clustAssing[:,0].A==i)[0],:]
#       markerStyle = scatterMarkers[i % len(scatterMarkers)]
#       ax1.scatter(ptsInCurrCluster[:,0].flatten().A[0], ptsInCurrCluster[:,1].flatten().A[0], marker=markerStyle, s=90)
#   ax1.scatter(myCentroids[:,0].flatten().A[0], myCentroids[:,1].flatten().A[0], marker='+', s=300)
#   plt.show()


if __name__ == '__main__':
    # ===== Show the raw data (disabled in the original) =====
    # # load the sample data
    # datamat=mat(loadDataSet('testSet.txt'))
    # # number of samples and feature dimension
    # num,dim=shape(datamat)
    # marksamples=['ok']  # marker style for the samples
    # for i in range(num):
    #     plt.plot(datamat[i,0],datamat[i,1],marksamples[0],markersize=6)
    # plt.title('dataset')  # title
    # plt.show()
    #
    # ===== k-means clustering (disabled in the original) =====
    # k=4  # user-defined number of clusters
    # # load the sample data
    # datamat=mat(loadDataSet('testSet.txt'))
    # run_num=8  # repeat to compare several clustering results
    # for i in range(run_num):
    #     mycentroids,clusterAssment=kMeans(datamat,k)
    #     # plot the result
    #     datashow(datamat,k,mycentroids,clusterAssment)

    # ===== Bisecting k-means =====
    # asmatrix replaces the ``mat`` alias, which NumPy 2.0 no longer provides.
    datamat2 = asmatrix(loadDataSet('testSet.txt'))
    k = 4
    for _ in range(1):  # raise the count to view several runs
        centlist, mynewassments = biKmeans(datamat2, k)
        datashow(datamat2, k, centlist, mynewassments)
#-*- coding: utf-8 -*-
from numpy import *
from matplotlib import pyplot as plt
import kMeans

######################################################
# Show the clustering after every split.
# Bisecting k-means, written out at top level; it mirrors
#   def biKmeans(dataSet, k, distMeas=distEclud):
# asmatrix is used instead of the ``mat`` alias, which NumPy 2.0 removed.
dataSet = asmatrix(kMeans.loadDataSet('testSet.txt'))
k = 4
distMeas = kMeans.distEclud
m = shape(dataSet)[0]
clusterAssment = asmatrix(zeros((m, 2)))
# Treat all samples as a single cluster and take its mean.
centroid0 = mean(dataSet, axis=0).tolist()[0]  # axis=0: column-wise; matrix -> list
centList = [centroid0]  # create a list with one centroid
for j in range(m):  # compute the initial total error (SSE)
    clusterAssment[j, 1] = distMeas(asmatrix(centroid0), dataSet[j, :]) ** 2
kMeans.datashow(dataSet, len(centList), asmatrix(centList), clusterAssment)
# while the number of clusters ... [the source is cut off at this point]