Skip to content

Commit 4026f9e

Browse files
committed
更新第三章:数据科学-sklearn, cluster。
1 parent 4e1714f commit 4026f9e

14 files changed

+1571
-720
lines changed

03_data_science/03_scikit-learn/kmeans/plot_cluster_iris.ipynb

Lines changed: 12 additions & 143 deletions
Large diffs are not rendered by default.

03_data_science/03_scikit-learn/kmeans/plot_color_quantization.ipynb

Lines changed: 6 additions & 168 deletions
Large diffs are not rendered by default.

03_data_science/03_scikit-learn/kmeans/plot_kmeans_stability_low_dim_dense.ipynb

Lines changed: 12 additions & 137 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,7 @@
33
{
44
"cell_type": "code",
55
"execution_count": null,
6-
"metadata": {
7-
"collapsed": false
8-
},
6+
"metadata": {},
97
"outputs": [],
108
"source": [
119
"%matplotlib inline"
@@ -42,50 +40,21 @@
4240
{
4341
"cell_type": "markdown",
4442
"metadata": {},
45-
"source": [
46-
"基于经验的k-means初始化方法\n",
47-
"\n",
48-
"评估k-均值初始化的能力,以使算法收敛稳健,如通过聚类惯性的相对标准偏差(即到最近聚类中心的平方距离之和)测量的。\n",
49-
"\n",
50-
"第一个图显示了最佳初始化参数(``KMeans`` or ``MiniBatchKMeans``)和init方法(``init=\"random\"`` or ``init=\"kmeans++\"``)的选择。\n",
51-
"\n",
52-
"第二个图显示了使用``init=\"random\"`` and ``n_init=1``的``MiniBatchKMeans``一次运行结果。这种运行导致一个坏的收敛(局部最优)。\n",
53-
"\n",
54-
"用于评估的数据集是符合高斯分布的2D网格数据。"
55-
]
43+
"source": []
5644
},
5745
{
5846
"cell_type": "code",
5947
"execution_count": 3,
60-
"metadata": {
61-
"collapsed": false
62-
},
48+
"metadata": {},
6349
"outputs": [
6450
{
6551
"name": "stdout",
6652
"output_type": "stream",
6753
"text": [
68-
"Automatically created module for IPython interactive environment\nEvaluation of KMeans with k-means++ init\n"
69-
]
70-
},
71-
{
72-
"name": "stdout",
73-
"output_type": "stream",
74-
"text": [
75-
"Evaluation of KMeans with random init\n"
76-
]
77-
},
78-
{
79-
"name": "stdout",
80-
"output_type": "stream",
81-
"text": [
82-
"Evaluation of MiniBatchKMeans with k-means++ init\n"
83-
]
84-
},
85-
{
86-
"name": "stdout",
87-
"output_type": "stream",
88-
"text": [
54+
"Automatically created module for IPython interactive environment\n",
55+
"Evaluation of KMeans with k-means++ init\n",
56+
"Evaluation of KMeans with random init\n",
57+
"Evaluation of MiniBatchKMeans with k-means++ init\n",
8958
"Evaluation of MiniBatchKMeans with random init\n"
9059
]
9160
},
@@ -114,109 +83,15 @@
11483
"print(__doc__)\n",
11584
"\n",
11685
"# Author: Olivier Grisel <olivier.grisel@ensta.org>\n",
117-
"# License: BSD 3 clause\n",
118-
"\n",
119-
"import numpy as np\n",
120-
"import matplotlib.pyplot as plt\n",
121-
"import matplotlib.cm as cm\n",
122-
"\n",
123-
"from sklearn.utils import shuffle\n",
124-
"from sklearn.utils import check_random_state\n",
125-
"from sklearn.cluster import MiniBatchKMeans\n",
126-
"from sklearn.cluster import KMeans\n",
127-
"\n",
128-
"random_state = np.random.RandomState(0)\n",
129-
"\n",
130-
"# Number of run (with randomly generated dataset) for each strategy so as\n",
131-
"# to be able to compute an estimate of the standard deviation\n",
132-
"n_runs = 5\n",
133-
"\n",
134-
"# k-means models can do several random inits so as to be able to trade\n",
135-
"# CPU time for convergence robustness\n",
136-
"n_init_range = np.array([1, 5, 10, 15, 20])\n",
137-
"\n",
138-
"# Datasets generation parameters\n",
139-
"n_samples_per_center = 100\n",
140-
"grid_size = 3\n",
141-
"scale = 0.1\n",
142-
"n_clusters = grid_size ** 2\n",
143-
"\n",
144-
"\n",
145-
"def make_data(random_state, n_samples_per_center, grid_size, scale):\n",
146-
" random_state = check_random_state(random_state)\n",
147-
" centers = np.array([[i, j]\n",
148-
" for i in range(grid_size)\n",
149-
" for j in range(grid_size)])\n",
150-
" n_clusters_true, n_features = centers.shape\n",
151-
"\n",
152-
" noise = random_state.normal(\n",
153-
" scale=scale, size=(n_samples_per_center, centers.shape[1]))\n",
154-
"\n",
155-
" X = np.concatenate([c + noise for c in centers])\n",
156-
" y = np.concatenate([[i] * n_samples_per_center\n",
157-
" for i in range(n_clusters_true)])\n",
158-
" return shuffle(X, y, random_state=random_state)\n",
159-
"\n",
160-
"# Part 1: Quantitative evaluation of various init methods\n",
161-
"\n",
162-
"plt.figure()\n",
163-
"plots = []\n",
164-
"legends = []\n",
165-
"\n",
166-
"cases = [\n",
167-
" (KMeans, 'k-means++', {}),\n",
168-
" (KMeans, 'random', {}),\n",
169-
" (MiniBatchKMeans, 'k-means++', {'max_no_improvement': 3}),\n",
170-
" (MiniBatchKMeans, 'random', {'max_no_improvement': 3, 'init_size': 500}),\n",
171-
"]\n",
172-
"\n",
173-
"for factory, init, params in cases:\n",
174-
" print(\"Evaluation of %s with %s init\" % (factory.__name__, init))\n",
175-
" inertia = np.empty((len(n_init_range), n_runs))\n",
176-
"\n",
177-
" for run_id in range(n_runs):\n",
178-
" X, y = make_data(run_id, n_samples_per_center, grid_size, scale)\n",
179-
" for i, n_init in enumerate(n_init_range):\n",
180-
" km = factory(n_clusters=n_clusters, init=init, random_state=run_id,\n",
181-
" n_init=n_init, **params).fit(X)\n",
182-
" inertia[i, run_id] = km.inertia_\n",
183-
" p = plt.errorbar(n_init_range, inertia.mean(axis=1), inertia.std(axis=1))\n",
184-
" plots.append(p[0])\n",
185-
" legends.append(\"%s with %s init\" % (factory.__name__, init))\n",
186-
"\n",
187-
"plt.xlabel('n_init')\n",
188-
"plt.ylabel('inertia')\n",
189-
"plt.legend(plots, legends)\n",
190-
"plt.title(\"Mean inertia for various k-means init across %d runs\" % n_runs)\n",
191-
"\n",
192-
"# Part 2: Qualitative visual inspection of the convergence\n",
193-
"\n",
194-
"X, y = make_data(random_state, n_samples_per_center, grid_size, scale)\n",
195-
"km = MiniBatchKMeans(n_clusters=n_clusters, init='random', n_init=1,\n",
196-
" random_state=random_state).fit(X)\n",
197-
"\n",
198-
"plt.figure()\n",
199-
"for k in range(n_clusters):\n",
200-
" my_members = km.labels_ == k\n",
201-
" color = cm.nipy_spectral(float(k) / n_clusters, 1)\n",
202-
" plt.plot(X[my_members, 0], X[my_members, 1], 'o', marker='.', c=color)\n",
203-
" cluster_center = km.cluster_centers_[k]\n",
204-
" plt.plot(cluster_center[0], cluster_center[1], 'o',\n",
205-
" markerfacecolor=color, markeredgecolor='k', markersize=6)\n",
206-
" plt.title(\"Example cluster allocation with a single random init\\n\"\n",
207-
" \"with MiniBatchKMeans\")\n",
208-
"\n",
209-
"plt.show()"
86+
"# License: BSD 3 clause\n"
21087
]
21188
},
21289
{
21390
"cell_type": "code",
21491
"execution_count": null,
21592
"metadata": {},
21693
"outputs": [],
217-
"source": [
218-
""
219-
]
94+
"source": []
22095
}
22196
],
22297
"metadata": {
@@ -228,16 +103,16 @@
228103
"language_info": {
229104
"codemirror_mode": {
230105
"name": "ipython",
231-
"version": 3.0
106+
"version": 3
232107
},
233108
"file_extension": ".py",
234109
"mimetype": "text/x-python",
235110
"name": "python",
236111
"nbconvert_exporter": "python",
237112
"pygments_lexer": "ipython3",
238-
"version": "3.6.6"
113+
"version": "3.8.8"
239114
}
240115
},
241116
"nbformat": 4,
242-
"nbformat_minor": 0
117+
"nbformat_minor": 1
243118
}

03_data_science/03_scikit-learn/kmeans/plot_mini_batch_kmeans.ipynb

Lines changed: 14 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,7 @@
33
{
44
"cell_type": "code",
55
"execution_count": 1,
6-
"metadata": {
7-
"collapsed": false
8-
},
6+
"metadata": {},
97
"outputs": [],
108
"source": [
119
"%matplotlib inline"
@@ -33,24 +31,24 @@
3331
{
3432
"cell_type": "markdown",
3533
"metadata": {},
36-
"source": [
37-
"比较K-Means和MiniBatchKMeans算法\n",
38-
"\n",
39-
"结论:初始化一致的情况下,差别很小。"
40-
]
34+
"source": []
4135
},
4236
{
4337
"cell_type": "code",
4438
"execution_count": 7,
45-
"metadata": {
46-
"collapsed": false
47-
},
39+
"metadata": {},
4840
"outputs": [
4941
{
5042
"name": "stdout",
5143
"output_type": "stream",
5244
"text": [
53-
"Automatically created module for IPython interactive environment\n0\n[False True False ... True True False] [1 0 1 ... 0 0 2] 0\n1\n[ True False True ... False False False] [1 0 1 ... 0 0 2] 1\n2\n[False False False ... False False True] [1 0 1 ... 0 0 2] 2\n"
45+
"Automatically created module for IPython interactive environment\n",
46+
"0\n",
47+
"[False True False ... True True False] [1 0 1 ... 0 0 2] 0\n",
48+
"1\n",
49+
"[ True False True ... False False False] [1 0 1 ... 0 0 2] 1\n",
50+
"2\n",
51+
"[False False False ... False False True] [1 0 1 ... 0 0 2] 2\n"
5452
]
5553
},
5654
{
@@ -175,9 +173,7 @@
175173
"execution_count": null,
176174
"metadata": {},
177175
"outputs": [],
178-
"source": [
179-
""
180-
]
176+
"source": []
181177
}
182178
],
183179
"metadata": {
@@ -189,16 +185,16 @@
189185
"language_info": {
190186
"codemirror_mode": {
191187
"name": "ipython",
192-
"version": 3.0
188+
"version": 3
193189
},
194190
"file_extension": ".py",
195191
"mimetype": "text/x-python",
196192
"name": "python",
197193
"nbconvert_exporter": "python",
198194
"pygments_lexer": "ipython3",
199-
"version": "3.6.6"
195+
"version": "3.8.8"
200196
}
201197
},
202198
"nbformat": 4,
203-
"nbformat_minor": 0
199+
"nbformat_minor": 1
204200
}

0 commit comments

Comments
 (0)