import math
from typing import Optional
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from clustviz.agglomerative import dist_mat_gen
from collections import Counter, OrderedDict
from copy import deepcopy
import random
from clustviz.utils import dist1, convert_colors, chernoffBounds, flatten_list, cluster_points, \
COLOR_DICT, CURE_REPS_COLORS, FONTSIZE_BIGGER, annotate_points, build_initial_matrices, draw_rectangle_or_encircle
def point_plot_mod2(
    X: np.ndarray,
    CURE_df: pd.DataFrame,
    reps: list,
    level_txt: float,
    level2_txt: Optional[float] = None,
    par_index=None,
    u=None,
    u_cl=None,
    initial_ind=None,
    last_reps: Optional[dict] = None,
    not_sampled=None,
    not_sampled_ind=None,
    n_rep_fin=None,
):
    """
    Scatter-plot of input data points, colored according to the cluster they belong to.

    The last merged cluster is highlighted through draw_rectangle_or_encircle; its representative
    points are plotted in red, along with its center of mass, plotted as a red cross. The current
    number of clusters and the current merging distance are displayed in the plot title.

    :param X: input data array; columns 0 and 1 are used as x and y coordinates.
    :param CURE_df: input dataframe built by CURE algorithm, listing the cluster and the x and y
                    coordinates of each point. Its index labels must be strings.
    :param reps: list of the coordinates of representative points.
    :param level_txt: distance at which current merging occurs, displayed in the title.
    :param level2_txt: incremental distance; only appended to the title when not None.
    :param par_index: partial index to take the shuffling of indexes into account.
    :param u: first cluster to be merged (only used when par_index is not None).
    :param u_cl: second cluster to be merged (only used when par_index is not None).
    :param initial_ind: initial partial index, used as point annotations when not None.
    :param last_reps: dictionary of last representative points; when not None the final
                      assignment phase plot of the large dataset version is drawn as well.
    :param not_sampled: coordinates of points that have not been initially sampled, in the large dataset version.
    :param not_sampled_ind: indexes of not_sampled points.
    :param n_rep_fin: number of representatives to use for each cluster in the final assignment phase in the large
                      dataset version.
    :raises ValueError: if last_reps is given without par_index, or if CURE_df contains no
                        cluster with more than one element.
    :return: if par_index is not None, returns the new indexes of par_index; otherwise None.
    """
    # diz maps every label of par_index to its row position in X. It takes the shuffling of data
    # into account, e.g. if the first row doesn't correspond to point 0: this is useful for the
    # large dataset version of CURE, where data points are randomly sampled, but the initial
    # indices are kept to be plotted.
    diz = None
    if par_index is not None:
        diz = dict(zip(par_index, range(len(par_index))))

    _, ax = plt.subplots(figsize=(14, 6))

    # points that still need to be processed are plotted in lime color
    ax.scatter(X[:, 0], X[:, 1], s=300, color="lime", edgecolor="black", zorder=3)

    # drops the totally null columns, so that the number of columns goes to
    # 2*(cardinality of biggest cluster); axis is passed by keyword because
    # pandas >= 2.0 no longer accepts it positionally
    CURE_df = CURE_df.dropna(axis=1, how="all")

    # clusters with more than one element are always appended at the bottom of CURE_df, so the
    # first label containing a "-" marks where the merged clusters start
    sizes = [len(label.split("-")) for label in CURE_df.index]
    start = min(pos for pos, size in enumerate(sizes) if size > 1)

    # for each cluster, take the single points composing it and plot them in the appropriate
    # color, if necessary taking the labels of par_index into account
    for ind, row_pos in enumerate(range(start, len(CURE_df))):
        points = cluster_points(CURE_df.index[row_pos])
        if par_index is not None:
            rows = [diz[p] for p in points]
        else:
            rows = [int(p) for p in points]
        X_clust = [X[r, 0] for r in rows]
        Y_clust = [X[r, 1] for r in rows]
        ax.scatter(X_clust, Y_clust, s=350, color=COLOR_DICT[ind % len(COLOR_DICT)], zorder=3)

    # last merged cluster, so the last element of matrix CURE_df; after the loop above
    # ind, X_clust and Y_clust also refer to this cluster
    points = cluster_points(CURE_df.index[-1])
    if par_index is not None:
        points = [diz[p] for p in points]
    else:
        points = [int(p) for p in points]

    # center of mass of the newly merged cluster, marked with an X
    com = X[points].mean(axis=0)
    ax.scatter(com[0], com[1], s=400, color="r", marker="X", edgecolor="black", zorder=3)

    # plotting representative points in red
    x_reps = [r[0] for r in reps]
    y_reps = [r[1] for r in reps]
    ax.scatter(x_reps, y_reps, s=360, color="r", edgecolor="black", zorder=3)

    draw_rectangle_or_encircle(X, points, X_clust, Y_clust, ax, ind)

    # adding labels to points in the plot
    labels = initial_ind if initial_ind is not None else range(len(X))
    annotate_points(annotations=labels, points=X, ax=ax)

    num_clust = "n° clust: " + str(len(CURE_df))
    min_dist = "min_dist: " + str(round(level_txt, 5))
    dist_incr = " --- dist_incr: " + str(round(level2_txt, 5)) if level2_txt is not None else ""
    title = num_clust + " --- " + min_dist + dist_incr
    ax.set_title(title, fontsize=FONTSIZE_BIGGER)

    plt.show()

    # last phase of the large dataset version
    if last_reps is not None:
        if diz is None:
            raise ValueError("par_index must be provided when last_reps is not None.")
        xmin, xmax = ax.get_xlim()
        xwidth = xmax - xmin
        assignment_phase_large_cure(X=X, CURE_df=CURE_df, diz=diz, initial_ind=initial_ind,
                                    last_reps=last_reps, not_sampled=not_sampled,
                                    not_sampled_ind=not_sampled_ind, n_rep_fin=n_rep_fin,
                                    xwidth=xwidth)

    # if par_index is not None, diz is updated with the last merged cluster and its keys are returned
    if par_index is not None:
        diz["(" + u + ")-(" + u_cl + ")"] = len(diz)
        return list(diz.keys())
    return None
def assignment_phase_large_cure(X, CURE_df, diz, initial_ind, last_reps, not_sampled,
                                not_sampled_ind, n_rep_fin, xwidth):
    """
    Draw the final assignment phase of the CURE variation for large datasets.

    An arrow is drawn from every not sampled point to its closest representative point;
    representative points are surrounded by small circles, to make them more visible, and
    the representatives of different clusters use different entries of CURE_REPS_COLORS.

    :param X: input data array.
    :param CURE_df: input dataframe built by CURE algorithm, listing the cluster and the x and y
                    coordinates of each point.
    :param diz: mapping from point labels to row positions in X, to take shuffling into account.
    :param initial_ind: initial partial index, used to annotate the points of X.
    :param last_reps: dictionary of last representative points.
    :param not_sampled: coordinates of points that have not been initially sampled, in the large dataset version.
    :param not_sampled_ind: indexes of not_sampled points.
    :param n_rep_fin: number of representatives to use for each cluster in the final assignment phase in the large
                      dataset version.
    :param xwidth: plot width; the circles around representatives have radius 0.03 * xwidth.
    """
    _, ax = plt.subplots(figsize=(14, 6))

    # every sampled point is first drawn in lime
    ax.scatter(X[:, 0], X[:, 1], s=300, color="lime", edgecolor="black")

    # color the points cluster by cluster and collect the center of mass of each cluster
    coms = []
    for row_pos in range(len(CURE_df)):
        cluster_color = COLOR_DICT[row_pos % len(COLOR_DICT)]
        members = cluster_points(CURE_df.iloc[row_pos].name)
        for member in members:
            ax.scatter(X[diz[member], 0], X[diz[member], 1], s=350, color=cluster_color)
        rows = [diz[member] for member in members]
        coms.append(X[rows].mean(axis=0))

    # all representatives in a single flat list, used for the nearest-representative search
    flat_reps = flatten_list(list(last_reps.values()))

    # representatives (with a circle around each of them) and centers of mass (marked with X)
    for i, group in enumerate(list(last_reps.values())):
        n_shown = min(n_rep_fin, len(group))
        rep_color = CURE_REPS_COLORS[i % len(CURE_REPS_COLORS)]
        x = [group[j][0] for j in range(n_shown)]
        y = [group[j][1] for j in range(n_shown)]

        ax.scatter(x, y, s=400, color=rep_color, edgecolor="black")
        ax.scatter(coms[i][0], coms[i][1], s=400, color=rep_color, marker="X", edgecolor="black")

        for x_rep, y_rep in zip(x, y):
            circle = plt.Circle((x_rep, y_rep), xwidth * 0.03, color=rep_color,
                                fill=False, linewidth=3, alpha=0.7)
            ax.add_artist(circle)

    # points that were left out of the sample
    ax.scatter(not_sampled[:, 0], not_sampled[:, 1], s=400, color="lime", edgecolor="black")

    # connect each not sampled point to its closest representative with an arrow
    for ns_point in not_sampled:
        closest = flat_reps[np.argmin([dist1(ns_point, candidate) for candidate in flat_reps])]
        ax.arrow(
            ns_point[0],
            ns_point[1],
            closest[0] - ns_point[0],
            closest[1] - ns_point[1],
            length_includes_head=True,
            head_width=0.03,
            head_length=0.05,
            color="black",
        )

    # plotting the indexes for each point
    annotate_points(annotations=initial_ind, points=X, ax=ax)
    if not_sampled_ind is not None:
        annotate_points(annotations=not_sampled_ind, points=not_sampled, ax=ax)

    plt.show()
def dist_clust_cure(rep_u: list, rep_v: list) -> float:
    """
    Distance between two clusters, defined as the smallest dist1 value over every pair made of
    one representative of the first cluster and one representative of the second cluster.

    :param rep_u: representatives of the first cluster.
    :param rep_v: representatives of the second cluster.
    :return: distance between two clusters.
    """
    first = np.array(rep_u)
    second = np.array(rep_v)
    # all pairwise distances, first cluster in the outer loop
    pairwise = [dist1(a, b) for a in first for b in second]
    return np.min(pairwise)
def update_mat_cure(mat: pd.DataFrame, i: str, j: str, rep_new: dict, name: str) -> pd.DataFrame:
    """
    Update distance matrix of CURE, by computing the new distances from the new representatives.

    A row and a column labelled "(" + i + ")-(" + j + ")" are added to mat (in place), then the
    rows and columns of the two merged clusters are dropped and the resulting dataframe is returned.

    :param mat: distance matrix between clusters; its row order is expected to match the key
                order of rep_new, because distances are paired with rows by position.
    :param i: row label of the first cluster to be merged (must be a string).
    :param j: row label of the second cluster to be merged (must be a string).
    :param rep_new: dictionary of representatives, still containing the two merged clusters
                    and already containing the new cluster under name.
    :param name: string of the form "(" + u + ")-(" + u_cl + ")", containing the new
                 name of the newly merged cluster.
    :return: updated matrix with new distances
    """
    # taking the 2 rows to be updated
    first_row = mat.loc[i]
    second_row = mat.loc[j]
    new_label = "(" + first_row.name + ")-(" + second_row.name + ")"

    # distances between the new cluster and every cluster currently in mat, taken in the
    # key order of rep_new (a dedicated loop name avoids shadowing the parameter i)
    key_list = list(rep_new.keys())
    vec = [
        dist_clust_cure(rep_new[name], rep_new[key_list[pos]])
        for pos in range(len(mat))
    ]

    # adding new row
    mat.loc[new_label, :] = vec
    # adding new column; the extra entry is the distance of the new cluster from itself
    mat[new_label] = vec + [np.inf]

    # dropping the old rows and the old columns; axis is passed by keyword because
    # pandas >= 2.0 no longer accepts it positionally
    merged = [first_row.name, second_row.name]
    mat = mat.drop(merged, axis=0)
    mat = mat.drop(merged, axis=1)

    return mat
def sel_rep(clusters: dict, name: str, c: int, alpha: float) -> list:
    """
    Select c representatives of the clusters: first one is the farthest from the centroid,
    the others c-1 are the ones with the largest summed distance from the already selected
    representatives. It doesn't use the old representatives, so it is slower than sel_rep_fast.

    The points stored in clusters are never modified: the shrunk representatives are returned
    as new arrays.

    :param clusters: dictionary of clusters.
    :param name: name of the cluster we want to select representatives from.
    :param c: number of representatives we want to extract.
    :param alpha: 0<=float<=1, it determines how much the representative points are moved
                  toward the centroid: 0 means they aren't modified, 1 means that all points
                  collapse to the centroid.
    :return: list of representative points.
    """
    points = clusters[name]
    com = np.mean(points, axis=0)

    if len(points) <= c:
        # the cluster has c points or less: all of them are representatives
        selected = list(points)
    else:
        # first representative: the point farthest from the centroid
        first = max(range(len(points)), key=lambda idx: dist1(points[idx], com))
        chosen = [first]
        selected = [np.array(points[first])]

        # the other c-1 representatives: each time pick, among the points not yet chosen,
        # the one with the largest summed distance from the representatives selected so far
        for _ in range(min(c - 1, len(points) - 1)):
            candidates = [idx for idx in range(len(points)) if idx not in chosen]
            best = max(
                candidates,
                key=lambda idx: np.sum([dist1(points[idx], np.array(r)) for r in selected]),
            )
            chosen.append(best)
            selected.append(points[best])

    # perform the shrinking according to the parameter alpha, building new arrays so that
    # the caller's cluster points are left untouched
    return [np.asarray(r) + alpha * (com - np.asarray(r)) for r in selected]
def sel_rep_fast(prec_reps: list, clusters: dict, name: str, c: int, alpha: float) -> list:
    """
    Select c representatives of the clusters from the previously computed representatives,
    so it is faster than sel_rep.

    The centroid is computed on all the points of clusters[name], while the candidates are
    only the points in prec_reps. prec_reps is never modified: the shrunk representatives are
    returned as new arrays.

    :param prec_reps: list of previously computed representatives.
    :param clusters: dictionary of clusters.
    :param name: name of the cluster we want to select representatives from.
    :param c: number of representatives we want to extract.
    :param alpha: 0<=float<=1, it determines how much the representative points are moved
                  toward the centroid: 0 means they aren't modified, 1 means that all points
                  collapse to the centroid.
    :return: list of representative points.
    """
    com = np.mean(clusters[name], axis=0)
    points = prec_reps  # candidates are the old representatives

    if len(points) <= c:
        # c old representatives or less: keep all of them
        selected = list(points)
    else:
        # first representative: the candidate farthest from the centroid
        first = max(range(len(points)), key=lambda idx: dist1(points[idx], com))
        chosen = [first]
        selected = [np.array(points[first])]

        # the other c-1 representatives: each time pick, among the candidates not yet chosen,
        # the one with the largest summed distance from the representatives selected so far
        for _ in range(min(c - 1, len(points) - 1)):
            candidates = [idx for idx in range(len(points)) if idx not in chosen]
            best = max(
                candidates,
                key=lambda idx: np.sum([dist1(points[idx], np.array(r)) for r in selected]),
            )
            chosen.append(best)
            selected.append(points[best])

    # perform the shrinking according to the parameter alpha, building new arrays so that
    # the list passed by the caller is left untouched
    return [np.asarray(r) + alpha * (com - np.asarray(r)) for r in selected]
def cure(
    X: np.ndarray,
    k: int,
    c: int = 3,
    alpha: float = 0.1,
    plotting: bool = True,
    preprocessed_data=None,
    partial_index=None,
    n_rep_finalclust=None,
    not_sampled=None,
    not_sampled_ind=None,
):
    """
    CURE algorithm: hierarchical agglomerative clustering using representatives. The parameters which default to
    None are used for the large dataset variation of CURE.

    :param X: input data array.
    :param k: desired number of clusters.
    :param c: number of representatives for each cluster.
    :param alpha: parameter that regulates the shrinking of representative points toward the centroid.
    :param plotting: if True, plots all intermediate steps.
    :param preprocessed_data: if not None, must be of the form (clusters,representatives,matrix_a,X_dist1), which is
                              used to perform a warm start.
    :param partial_index: if not None, it is used as index of the matrix_a, of cluster points and of representatives.
    :param n_rep_finalclust: number of final representative points per cluster used to classify the
                             not_sampled points; when None, c is used.
    :param not_sampled: points not sampled in the initial phase.
    :param not_sampled_ind: indexes of not_sampled points.
    :return: (clusters, rep, CURE_df): the clusters dictionary, the dictionary of representatives, the matrix a.
    """
    if preprocessed_data is None:
        # starting from raw data: dataframe storing the x and y coordinates of input data points
        CURE_df, CURE_df_nonan = build_initial_matrices(X, partial_index)

        # initial clusters and representatives: every point is its own cluster
        if partial_index is not None:
            clusters = dict(zip(partial_index, X))
            rep = {partial_index[pos]: [X[int(pos)]] for pos in range(len(X))}
        else:
            clusters = {str(pos): np.array(point) for pos, point in enumerate(X)}
            rep = {str(pos): [point] for pos, point in enumerate(X)}

        # build Xdist
        X_dist = dist_mat_gen(CURE_df_nonan)
    else:
        # warm start from precomputed data
        clusters = preprocessed_data[0]
        rep = preprocessed_data[1]
        CURE_df = preprocessed_data[2]
        X_dist = preprocessed_data[3]

    # placeholder so that the while loop starts with one entry per cluster
    heap = [1] * len(X_dist)
    # store minimum distances between clusters for each iteration
    levels = []

    # store original index, used to annotate the plots
    if partial_index is not None:
        initial_index = deepcopy(partial_index)

    # while the desired number of clusters has not been reached
    while len(heap) > k:
        # for every cluster find the distance and the column position of its closest cluster
        list_argmin = list(X_dist.apply(lambda x: np.argmin(x)).values)
        list_min = list(X_dist.min(axis=0).values)
        heap = dict(zip(list(X_dist.index), list_min))
        heap = dict(sorted(heap.items(), key=lambda kv: kv[1]))
        closest = dict(zip(list(X_dist.index), list_argmin))

        # u is the cluster with the smallest distance from its closest cluster u_cl;
        # after removing u, len(heap) equals the number of clusters left after this merge
        u = min(heap, key=heap.get)
        levels.append(heap[u])
        del heap[u]
        u_cl = X_dist.columns[closest[u]]

        # NOTE(review): form_new_cluster is not among the names imported at the top of the
        # visible part of this file -- confirm it is in scope.
        new_cluster = form_new_cluster(clusters, u, u_cl)

        # delete old clusters
        del clusters[u]
        del clusters[u_cl]

        # set new name
        name = "(" + u + ")" + "-" + "(" + u_cl + ")"
        clusters[name] = new_cluster

        # update representatives
        rep[name] = sel_rep_fast(rep[u] + rep[u_cl], clusters, name, c, alpha)

        # update distance matrix; rep must still contain u and u_cl at this point
        X_dist = update_mat_cure(X_dist, u, u_cl, rep, name)

        # delete old representatives
        del rep[u]
        del rep[u_cl]

        # update the matrix a with the new cluster: the coordinates of u_cl are shifted to the
        # right of the non-null coordinates of u
        dim1 = int(CURE_df.loc[u].notna().sum())
        CURE_df.loc[name, :] = CURE_df.loc[u].fillna(0) + CURE_df.loc[u_cl].shift(dim1, fill_value=0)
        # axis is passed by keyword because pandas >= 2.0 no longer accepts it positionally
        CURE_df = CURE_df.drop(u, axis=0)
        CURE_df = CURE_df.drop(u_cl, axis=0)

        if plotting is True:
            if partial_index is None:
                point_plot_mod2(X, CURE_df, rep[name], levels[-1])
            elif (len(heap) == k) and (not_sampled is not None) and (not_sampled_ind is not None):
                # only in last step of large dataset version of CURE:
                # take random representative points from the final representatives
                n_final = c if n_rep_finalclust is None else n_rep_finalclust
                final_reps = {
                    label: random.sample(points, min(n_final, len(points)))
                    for label, points in rep.items()
                }
                partial_index = point_plot_mod2(
                    X=X,
                    CURE_df=CURE_df,
                    reps=rep[name],
                    level_txt=levels[-1],
                    par_index=partial_index,
                    u=u,
                    u_cl=u_cl,
                    initial_ind=initial_index,
                    last_reps=final_reps,
                    not_sampled=not_sampled,
                    not_sampled_ind=not_sampled_ind,
                    n_rep_fin=n_final,
                )
            else:
                # in the intermediate steps of the large dataset version
                partial_index = point_plot_mod2(
                    X=X,
                    CURE_df=CURE_df,
                    reps=rep[name],
                    level_txt=levels[-1],
                    par_index=partial_index,
                    u=u,
                    u_cl=u_cl,
                    initial_ind=initial_index,
                )

    return clusters, rep, CURE_df
def plot_results_cure(clust: dict) -> None:
    """
    Scatter plot of the output of CURE algorithm: every value of the input dictionary is drawn
    with its own scatter call.

    :param clust: output of CURE algorithm, dictionary of the form cluster_labels+point_indices: coords of points
    """
    _, axes = plt.subplots(figsize=(14, 6))

    for coords in clust.values():
        coords = np.array(coords)
        # a cluster made of a single point is stored as a 1-d array
        if coords.ndim > 1:
            xs, ys = coords[:, 0], coords[:, 1]
        else:
            xs, ys = coords[0], coords[1]
        axes.scatter(xs, ys, s=300)

    plt.show()
def dist_mat_gen_cure(reps: dict) -> pd.DataFrame:
    """
    Build distance matrix for CURE algorithm, using the dictionary of representatives.

    The matrix is symmetric, its rows and columns are labelled with the keys of reps (in
    insertion order) and its diagonal is filled with np.inf.

    :param reps: dictionary of representative points, the only ones used to compute distances
                 between clusters.
    :return: distance matrix as dataframe
    """
    labels = list(reps.keys())
    size = len(labels)

    # the matrix is filled in a preallocated array instead of enlarging a dataframe one cell
    # at a time; np.inf is the default so the diagonal needs no further handling
    values = np.full((size, size), np.inf)
    for row in range(size):
        for col in range(row + 1, size):
            # each pair is computed once and mirrored
            distance = dist_clust_cure(reps[labels[row]], reps[labels[col]])
            values[row, col] = distance
            values[col, row] = distance

    return pd.DataFrame(values, index=labels, columns=labels)
def cure_sample_part(
    X: np.ndarray,
    k: int,
    c: int = 3,
    alpha: float = 0.3,
    u_min: Optional[int] = None,
    f: float = 0.3,
    d: float = 0.02,
    p: Optional[int] = None,
    q: Optional[int] = None,
    n_rep_finalclust: Optional[int] = None,
    plotting: bool = True,
):
    """
    CURE algorithm variation for large datasets.

    Partition the sample space into p partitions, each of size len(X)/p, then partially cluster each
    partition until the final number of clusters in each partition reduces to n/(pq). Then run a second
    clustering pass on the n/q partial clusters for all the partitions.

    :param X: input data array.
    :param k: desired number of clusters.
    :param c: number of representatives for each cluster.
    :param alpha: parameter that regulates the shrinking of representative points toward the centroid.
    :param u_min: size of the smallest cluster u; defaults to round(len(X) / k).
    :param f: percentage of cluster points (0 <= f <= 1) we would like to have in the sample.
    :param d: (0 <= d <= 1) the probability that the sample contains less than f*|u| points of cluster u is less than d.
    :param p: the number of partitions.
    :param q: the number >1 such that each partition reduces to n/(pq) clusters.
    :param n_rep_finalclust: number of representatives to use in the final assignment phase; defaults to c.
    :param plotting: if True, plots all intermediate steps.
    :raises ValueError: if only one of p and q is specified.
    :return: (clusters, rep, CURE_df): the clusters dictionary, the dictionary of representatives, the matrix a.
    """
    if (p is None) != (q is None):
        raise ValueError("p and q must be both specified if not None.")

    # choose the parameters suggested by the paper if the user doesnt provide input parameters
    if u_min is None:
        u_min = round(len(X) / k)
    if n_rep_finalclust is None:
        n_rep_finalclust = c

    _, df_nonan = build_initial_matrices(X)

    # this is done to ensure that the algorithm starts even when input params are bad:
    # f is lowered (and then d doubled) until the required sample size fits the dataset
    while True:
        print("new f: ", f)
        print("new d: ", d)
        n = math.ceil(chernoffBounds(u_min=u_min, f=f, N=len(X), k=k, d=d))
        if n <= len(df_nonan):
            b_sampled = df_nonan.sample(n, random_state=42)
            break
        if f >= 0.19:
            f = f - 0.1
        else:
            d = d * 2

    # points left out of the sample, kept in their original order
    sampled_labels = set(b_sampled.index)
    b_notsampled = df_nonan.loc[
        [str(i) for i in range(len(df_nonan)) if str(i) not in sampled_labels], :
    ]

    # find the best p and q according to the paper
    if (p is None) and (q is None):

        def g(x):
            return (x[1] - 1) / (x[0] * x[1]) + 1 / (x[1] ** 2)

        results = {(i, j): g([i, j]) for i in range(2, 15) for j in range(2, 15)}
        p, q = max(results, key=results.get)
        print("p: ", p)
        print("q: ", q)

    if (n / (p * q)) < 2 * k:
        print("n/pq is less than 2k, results could be wrong.")

    if k * d >= 1:
        print("k*d is greater or equal to 1, results could be wrong.")

    # form the partitions: p consecutive slices of the sampled rows
    lin_sp = np.linspace(0, n, p + 1, dtype="int")
    b_partitions = [
        b_sampled.iloc[lin_sp[num_p]: lin_sp[num_p + 1]] for num_p in range(p)
    ]

    k_prov = round(n / (p * q))

    # perform clustering on each partition separately
    partial_clust = []
    partial_rep = []
    partial_CURE_df = []

    for i in range(p):
        print("\n")
        print(i)
        clusters, rep, CURE_df = cure(
            b_partitions[i].values,
            k=k_prov,
            c=c,
            alpha=alpha,
            plotting=plotting,
            partial_index=b_partitions[i].index,
        )
        partial_clust.append(clusters)
        partial_rep.append(rep)
        partial_CURE_df.append(CURE_df)

    # merging all data into single components (loop names chosen so that the
    # probability parameter d is not shadowed)
    # clusters
    clust_tot = {}
    for part_clusters in partial_clust:
        clust_tot.update(part_clusters)

    # representatives
    rep_tot = {}
    for part_reps in partial_rep:
        rep_tot.update(part_reps)

    # mat CURE_df: partitions whose size differs from the most common one get an extra pair of
    # all-NaN coordinate columns before the partial matrices are stacked
    diz = {i: len(b_partitions[i]) for i in range(p)}
    num_freq = Counter(diz.values()).most_common(1)[0][0]
    bad_ind = [part for part, size in diz.items() if size != num_freq]

    for ind in bad_ind:
        partial_CURE_df[ind]["{0}x".format(diz[ind])] = np.nan
        partial_CURE_df[ind]["{0}y".format(diz[ind])] = np.nan

    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the same way
    # and also works with a single partition
    CURE_df_tot = pd.concat(partial_CURE_df)

    # mat Xdist
    X_dist_tot = dist_mat_gen_cure(rep_tot)

    # final_clustering
    prep_data = [clust_tot, rep_tot, CURE_df_tot, X_dist_tot]
    clusters, rep, CURE_df = cure(
        b_sampled.values,
        k=k,
        c=c,
        alpha=alpha,
        preprocessed_data=prep_data,
        partial_index=b_sampled.index,
        n_rep_finalclust=n_rep_finalclust,
        not_sampled=b_notsampled.values,
        plotting=plotting,
        not_sampled_ind=b_notsampled.index,
    )

    return clusters, rep, CURE_df
def demo_parameters():
    """Draw a 2x2 grid of plots showing how d, k, f and u_min affect the sample size."""
    total_points = 20000

    plt.figure(figsize=(12, 10))
    plt.suptitle("Effects on sample size from different parameters")

    def compute_res(u_size, f, N, k, d):
        # sample size as a function of the parameters; the log term is shared by the addends
        log_term = np.log(1 / d)
        return k * (
            f * N
            + N / u_size * log_term
            + N
            / u_size
            * np.sqrt(log_term ** 2 + 2 * f * u_size * log_term)
        )

    def draw_panel(x_values, y_values, x_label, y_label=None):
        # the red horizontal line marks the size of the whole dataset
        plt.axhline(total_points, color="r")
        plt.plot(x_values, y_values)
        plt.xlabel(x_label)
        if y_label is not None:
            plt.ylabel(y_label)

    # top-left: sample size against d
    panel = plt.subplot(2, 2, 1)
    u_size, f, k = 6000, 0.20, 4
    d_values = np.linspace(0.0000001, 1, 100)
    panel.set_title("u_min: {0}, f:{1}, k:{2}".format(u_size, f, k))
    draw_panel(d_values, compute_res(u_size, f, total_points, k, d_values), "d", "sample size")

    # top-right: sample size against k
    panel = plt.subplot(2, 2, 2)
    u_size, f, d = 3000, 0.2, 0.1
    k_values = list(range(1, 13))
    panel.set_title("u_min: {0}, f:{1}, d:{2}".format(u_size, f, d))
    sizes = [compute_res(u_size, f, total_points, k_value, d) for k_value in k_values]
    draw_panel(k_values, sizes, "k")

    # bottom-left: sample size against f
    panel = plt.subplot(2, 2, 3)
    u_size, d, k = 5000, 0.1, 4
    f_values = np.linspace(0.00001, 1, 100)
    panel.set_title("u_min: {0}, d:{1}, k:{2}".format(u_size, d, k))
    draw_panel(f_values, compute_res(u_size, f_values, total_points, k, d), "f", "sample size")

    # bottom-right: sample size against u_min
    panel = plt.subplot(2, 2, 4)
    f, d, k = 0.2, 0.1, 4
    u_values = np.linspace(200, 10000, 30)
    panel.set_title("f: {0}, d:{1}, k:{2}".format(f, d, k))
    draw_panel(u_values, compute_res(u_values, f, total_points, k, d), "u_min")

    plt.show()