`关键词`：机器学习，聚类算法，数据集合

## 一、基础类型

### 1、月牙形数据集合

```python
from headm import *
import numpy as np

# `plt` and `PlotGIF` are expected to come from `from headm import *`
# (project helper module, not shown in this article).
pltgif = PlotGIF()


def moon2Data(datanum):
    """Plot two noisy, interleaved parabolic arcs and record the frame.

    Args:
        datanum: Number of points generated for each of the two arcs.
    """
    # Upper arc: downward-opening parabola sampled on x in [-3, 3].
    x1 = np.linspace(-3, 3, datanum)
    y1 = -np.square(x1) / 3 + 4.5 + np.random.randn(datanum) * 0.15

    # Lower arc: upward-opening parabola sampled on x in [0, 6].
    x2 = np.linspace(0, 6, datanum)
    y2 = np.square(x2 - 3) / 3 + 0.5 + np.random.randn(datanum) * 0.15

    plt.clf()
    # Fixed axis limits keep every animation frame in the same view.
    plt.axis([-3.5, 6.5, -.5, 5.5])
    plt.scatter(x1, y1, s=10)
    plt.scatter(x2, y2, s=10)
    plt.draw()
    plt.pause(.1)
    # Hand the figure to the GIF collector (PlotGIF is defined in headm).
    pltgif.append(plt)


# Each call draws fresh noise, so the 20 frames differ from one another.
for _ in range(20):
    moon2Data(300)
pltgif.save(r'd:\temp\GIF1.GIF')
```

### 2、方形数据集

```python
from headm import *
import numpy as np

# `plt` and `PlotGIF` are expected to come from `from headm import *`
# (project helper module, not shown in this article).
pltgif = PlotGIF()


def moon2Data(datanum):
    """Plot uniform points in the unit square, split into three regions.

    NOTE(review): the name is carried over from the moon example although
    this function produces the square data set; it is kept unchanged so
    existing calls keep working.

    Args:
        datanum: Total number of random points to draw.
    """
    pts = np.random.rand(datanum, 2)

    # Region 1: points below both diagonals (y <= x and y <= 1 - x).
    in_bottom = (pts[:, 1] <= pts[:, 0]) & (pts[:, 1] <= 1 - pts[:, 0])
    bottom = pts[in_bottom]

    # The remaining points are split at x = 0.5 into a left and a right part.
    rest = pts[~in_bottom]
    in_left = rest[:, 0] <= 0.5
    left = rest[in_left]
    right = rest[~in_left]

    plt.clf()
    for region in (bottom, left, right):
        plt.scatter(region[:, 0], region[:, 1], s=10)
    plt.draw()
    plt.pause(.1)
    # Hand the figure to the GIF collector (PlotGIF is defined in headm).
    pltgif.append(plt)


# Each call draws a fresh random sample, giving 20 distinct frames.
for _ in range(20):
    moon2Data(1000)
pltgif.save(r'd:\temp\GIF1.GIF')
```

### 3、螺旋形数据集合

```python
from headm import *
import numpy as np

# `plt` and `PlotGIF` are expected to come from `from headm import *`
# (project helper module, not shown in this article).
pltgif = PlotGIF()


def randData(datanum):
    """Plot a noisy spiral (x = t*cos(t), y = t*sin(t)) and record the frame.

    Args:
        datanum: Number of points sampled along the spiral.
    """
    # Curve parameter t is uniform in [1.5*pi, 6*pi); shape (1, datanum).
    t = 1.5 * np.pi * (1 + 3 * np.random.rand(1, datanum))
    x = t * np.cos(t)
    y = t * np.sin(t)

    # Stack into shape (2, datanum), add Gaussian jitter, then transpose
    # so that each row of X is one (x, y) point.
    X = np.concatenate((x, y))
    X += 0.7 * np.random.randn(2, datanum)
    X = X.T

    # NOTE(review): the norm is built from the noise-free y range but is
    # applied to the noisy x coordinates below - confirm this is intended.
    norm = plt.Normalize(y.min(), y.max())

    plt.clf()
    plt.scatter(X[:, 0], X[:, 1], s=10, c=norm(X[:, 0]), cmap='viridis')
    # Fixed axis limits keep every animation frame in the same view.
    plt.axis([-20, 21, -20, 16])
    plt.draw()
    plt.pause(.1)
    # Hand the figure to the GIF collector (PlotGIF is defined in headm).
    pltgif.append(plt)


# Each call draws a fresh random sample, giving 20 distinct frames.
for _ in range(20):
    randData(1000)
pltgif.save(r'd:\temp\GIF1.GIF')
```

```python
from headm import *
import numpy as np

# `plt` and `PlotGIF` are expected to come from `from headm import *`
# (project helper module, not shown in this article).
pltgif = PlotGIF()


def randData(datanum, delta):
    """Plot a spiral with adjustable noise and record the frame.

    Args:
        datanum: Number of points sampled along the spiral.
        delta: Standard deviation of the Gaussian jitter added to each
            coordinate; 0 gives the exact spiral.
    """
    # Curve parameter t is uniform in [1.5*pi, 6*pi); shape (1, datanum).
    t = 1.5 * np.pi * (1 + 3 * np.random.rand(1, datanum))
    x = t * np.cos(t)
    y = t * np.sin(t)

    # Stack into shape (2, datanum), add the jitter, then transpose so
    # that each row of X is one (x, y) point.
    X = np.concatenate((x, y))
    X += delta * np.random.randn(2, datanum)
    X = X.T

    # NOTE(review): the norm is built from the noise-free y range but is
    # applied to the noisy x coordinates below - confirm this is intended.
    norm = plt.Normalize(y.min(), y.max())

    plt.clf()
    plt.scatter(X[:, 0], X[:, 1], s=10, c=norm(X[:, 0]), cmap='viridis')
    # Fixed axis limits keep every animation frame in the same view.
    plt.axis([-20, 21, -20, 16])
    plt.draw()
    plt.pause(.1)
    # Hand the figure to the GIF collector (PlotGIF is defined in headm).
    pltgif.append(plt)


# Ramp the noise level up from 0 to 29/20 ...
for i in range(30):
    randData(1000, i / 20)
# ... and back down to 0, so the animation loops smoothly.
for i in reversed(range(30)):
    randData(1000, i / 20)
pltgif.save(r'd:\temp\GIF1.GIF')
```

## 二、基础数据集

### 1、点簇形数据集合

```python
from headm import *
from sklearn.datasets import make_blobs

# `plt`, `random` and `PlotGIF` are expected to come from
# `from headm import *` (project helper module, not shown in this article).
pltgif = PlotGIF()


def randData(datanum):
    """Plot three random 2-D blobs from `make_blobs` and record the frame.

    Args:
        datanum: Total number of samples, divided among the three blobs.
    """
    # A fresh random seed per call makes every frame use new blob centers.
    seed = random.randint(0, 1000)
    x1, y1 = make_blobs(n_samples=datanum, n_features=2, centers=3,
                        random_state=seed)

    plt.clf()
    # Colour each point by its blob label.
    plt.scatter(x1[:, 0], x1[:, 1], c=y1, s=10)
    plt.draw()
    plt.pause(.1)
    # Hand the figure to the GIF collector (PlotGIF is defined in headm).
    pltgif.append(plt)


for _ in range(20):
    randData(300)
pltgif.save(r'd:\temp\gif1.gif')
```

```python
plt.scatter(x1[y1==0][:,0], x1[y1==0][:,1], s=10)
plt.scatter(x1[y1==1][:,0], x1[y1==1][:,1], s=10)
plt.scatter(x1[y1==2][:,0], x1[y1==2][:,1], s=10)
```

### 2、线簇形数据集合

```python
transformation = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]]
x1 = dot(x1, transformation)
```

```python
a = array(transformation)
w, v = linalg.eig(a)
printf(w, v)
```

### 3、环形数据集合

```python
from headm import *
from sklearn.datasets import make_circles

# `plt`, `random` and `PlotGIF` are expected to come from
# `from headm import *` (project helper module, not shown in this article).
pltgif = PlotGIF()


def randData(datanum):
    """Plot two noisy concentric circles from `make_circles` and record the frame.

    Args:
        datanum: Total number of samples across both circles.
    """
    # A fresh random seed per call makes every frame use new noise.
    seed = random.randint(0, 1000)
    x1, y1 = make_circles(n_samples=datanum, noise=0.07,
                          random_state=seed, factor=0.6)

    plt.clf()
    # One scatter call per class label so each circle gets its own colour.
    for label in (0, 1):
        members = x1[y1 == label]
        plt.scatter(members[:, 0], members[:, 1], s=10)
    # Fixed axis limits keep every animation frame in the same view.
    plt.axis([-1.2, 1.2, -1.2, 1.2])
    plt.draw()
    plt.pause(.1)
    # Hand the figure to the GIF collector (PlotGIF is defined in headm).
    pltgif.append(plt)


for _ in range(20):
    randData(1000)
pltgif.save(r'd:\temp\gif1.gif')
```

### 4、月牙数据集合

```python
from headm import *
from sklearn.datasets import make_moons

# `plt`, `random` and `PlotGIF` are expected to come from
# `from headm import *` (project helper module, not shown in this article).
pltgif = PlotGIF()


def randData(datanum):
    """Plot two noisy interleaving half-moons from `make_moons` and record the frame.

    Args:
        datanum: Total number of samples across both moons.
    """
    # A fresh random seed per call makes every frame use new noise.
    seed = random.randint(0, 1000)
    x1, y1 = make_moons(n_samples=datanum, noise=0.07, random_state=seed)

    plt.clf()
    # One scatter call per class label so each moon gets its own colour.
    for label in (0, 1):
        members = x1[y1 == label]
        plt.scatter(members[:, 0], members[:, 1], s=10)
    # Fixed axis limits keep every animation frame in the same view.
    plt.axis([-1.5, 2.5, -1, 1.5])
    plt.draw()
    plt.pause(.1)
    # Hand the figure to the GIF collector (PlotGIF is defined in headm).
    pltgif.append(plt)


for _ in range(20):
    randData(1000)
pltgif.save(r'd:\temp\gif1.gif')
```

## ※测试结论 ※

`sklearn` 里面还有很多函数可以用来生成自定义数据；除此之外，也可以先用 `numpy` 生成数据，再通过高级索引进行划分。最好结合 `matplotlib` 中的 `cmap` 做颜色映射，这样可以做出既好玩又好看的数据集。