# Source code for clustviz.cure

import math
from typing import Optional

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from clustviz.agglomerative import dist_mat_gen
from collections import Counter, OrderedDict
from copy import deepcopy
import random

from clustviz.utils import dist1, convert_colors, chernoffBounds, flatten_list, cluster_points, \
    COLOR_DICT, CURE_REPS_COLORS, FONTSIZE_BIGGER, annotate_points, build_initial_matrices, draw_rectangle_or_encircle


def point_plot_mod2(
    X: np.ndarray,
    CURE_df: pd.DataFrame,
    reps: list,
    level_txt: float,
    level2_txt: Optional[float] = None,
    par_index=None,
    u=None,
    u_cl=None,
    initial_ind=None,
    last_reps: Optional[dict] = None,
    not_sampled=None,
    not_sampled_ind=None,
    n_rep_fin=None,
):
    """
    Scatter-plot of input data points, colored according to the cluster they belong to.
    The last merged cluster is highlighted through draw_rectangle_or_encircle; its representative
    points are plotted in red, along with its center of mass, plotted as a red cross. The current
    number of clusters and the current distance are displayed in the title.

    :param X: input data array; columns 0 and 1 are used as x and y coordinates.
    :param CURE_df: input dataframe built by CURE algorithm, listing the cluster and the x and y
              coordinates of each point. Its index holds the cluster names, with the last row being
              the last merged cluster.
    :param reps: list of the coordinates of representative points.
    :param level_txt: distance at which current merging occurs, displayed in the title.
    :param level2_txt: incremental distance, appended to the title only if not None.
    :param par_index: partial index to take the shuffling of indexes into account.
    :param u: first cluster to be merged.
    :param u_cl: second cluster to be merged.
    :param initial_ind: initial partial index, used to annotate the points.
    :param last_reps: dictionary of last representative points; if not None, the final assignment
                      phase of the large dataset version is plotted too (requires par_index).
    :param not_sampled: coordinates of points that have not been initially sampled, in the large dataset version.
    :param not_sampled_ind: indexes of not_sampled points.
    :param n_rep_fin: number of representatives to use for each cluster in the final assignment phase in the large
                      dataset version.
    :return: if par_index is not None, returns the new indexes of par_index (the old ones plus the
             name of the newly merged cluster); otherwise None.
    :raises ValueError: if last_reps is given without par_index, or if CURE_df holds no merged cluster.
    """
    # the assignment phase needs the label -> row mapping built from par_index; fail early with a
    # clear message instead of hitting an unbound variable after the first figure has been shown
    if last_reps is not None and par_index is None:
        raise ValueError("par_index must be provided when last_reps is given.")

    # diz is used to take the shuffling of data into account, e.g. if the first row doesn't
    # correspond to point 0: this is useful for the large dataset version of CURE, where data points
    # are randomly sampled, but the initial indices are kept to be plotted.
    diz = None
    if par_index is not None:
        diz = dict(zip(par_index, range(len(par_index))))

    def rows_of(cluster_label):
        # translate the point labels stored in a cluster name into row positions of X
        members = cluster_points(cluster_label)
        if diz is not None:
            return [diz[p] for p in members]
        return [int(p) for p in members]

    _, ax = plt.subplots(figsize=(14, 6))

    # points that still need to be processed are plotted in lime color
    ax.scatter(X[:, 0], X[:, 1], s=300, color="lime", edgecolor="black", zorder=3)

    # drops the totally null columns, so that the number of columns goes to 2*(cardinality of biggest cluster);
    # the axis is passed by keyword because recent pandas versions reject it as a positional argument
    CURE_df = CURE_df.dropna(axis=1, how="all")

    # clusters with more than 1 element are always appended to CURE_df, so the first multi-point
    # name marks where the clusters to be colored start
    merged_positions = [
        pos for pos, label in enumerate(CURE_df.index) if len(label.split("-")) > 1
    ]
    if not merged_positions:
        raise ValueError("CURE_df does not contain any merged cluster to plot.")
    start = merged_positions[0]

    # for each cluster, take the single points composing it and plot them in the appropriate color
    for ind, label in enumerate(CURE_df.index[start:]):
        rows = rows_of(label)
        X_clust = [X[r, 0] for r in rows]
        Y_clust = [X[r, 1] for r in rows]

        ax.scatter(X_clust, Y_clust, s=350, color=COLOR_DICT[ind % len(COLOR_DICT)], zorder=3)

    # last merged cluster, so the last element of matrix CURE_df; X_clust, Y_clust and ind left
    # over from the loop above refer to this same cluster
    points = rows_of(CURE_df.index[-1])
    # center of mass of the newly merged cluster
    com = X[points].mean(axis=0)

    # plotting the center of mass, marked with an X
    ax.scatter(com[0], com[1], s=400, color="r", marker="X", edgecolor="black", zorder=3)

    # plotting representative points in red
    x_reps = [r[0] for r in reps]
    y_reps = [r[1] for r in reps]
    ax.scatter(x_reps, y_reps, s=360, color="r", edgecolor="black", zorder=3)

    draw_rectangle_or_encircle(X, points, X_clust, Y_clust, ax, ind)

    # adding labels to points in the plot
    labels = initial_ind if initial_ind is not None else range(len(X))
    annotate_points(annotations=labels, points=X, ax=ax)

    num_clust = "n° clust: " + str(len(CURE_df))
    min_dist = "min_dist: " + str(round(level_txt, 5))
    dist_incr = "  ---  dist_incr: " + str(round(level2_txt, 5)) if level2_txt is not None else ""

    ax.set_title(num_clust + " --- " + min_dist + dist_incr, fontsize=FONTSIZE_BIGGER)

    plt.show()

    # last phase of the large dataset version
    if last_reps is not None:
        xmin, xmax = ax.get_xlim()

        assignment_phase_large_cure(X=X, CURE_df=CURE_df, diz=diz, initial_ind=initial_ind,
                                    last_reps=last_reps, not_sampled=not_sampled,
                                    not_sampled_ind=not_sampled_ind, n_rep_fin=n_rep_fin,
                                    xwidth=xmax - xmin)

    # if par_index is not None, diz is updated with the last merged cluster and its keys are returned
    if diz is not None:
        diz["(" + u + ")-(" + u_cl + ")"] = len(diz)

        return list(diz.keys())


def assignment_phase_large_cure(X, CURE_df, diz, initial_ind, last_reps, not_sampled,
                                not_sampled_ind, n_rep_fin, xwidth):
    """
    In the last phase of CURE algorithm variation for large datasets, arrows are
    displayed from every not sampled point to its closest representative point; moreover, representative
    points are surrounded by small circles, to make them more visible. Representative points of different
    clusters are plotted with the colors listed in CURE_REPS_COLORS.

    :param X: input data array.
    :param CURE_df: input dataframe built by CURE algorithm; its index holds the cluster names.
    :param diz: dictionary mapping each point label to its row position in X, to take shuffling into account.
    :param initial_ind: initial partial index, used to annotate the sampled points.
    :param last_reps: dictionary of last representative points, one entry per cluster. The i-th entry
                      is drawn together with the center of mass of the i-th row of CURE_df.
    :param not_sampled: coordinates of points that have not been initially sampled, in the large dataset version.
    :param not_sampled_ind: indexes of not_sampled points; if None, those points are not annotated.
    :param n_rep_fin: maximum number of representatives to draw for each cluster.
    :param xwidth: plot width, used to scale the circles drawn around the representatives.
    """
    _, ax = plt.subplots(figsize=(14, 6))

    # plot all the points in color lime
    ax.scatter(X[:, 0], X[:, 1], s=300, color="lime", edgecolor="black")

    # color the points of each cluster (one scatter call per cluster) and store its center of mass
    coms = []
    for ind, label in enumerate(CURE_df.index):
        rows = [diz[p] for p in cluster_points(label)]
        ax.scatter(X[rows, 0], X[rows, 1], s=350, color=COLOR_DICT[ind % len(COLOR_DICT)])
        coms.append(X[rows].mean(axis=0))

    # representatives grouped by cluster, and the same representatives in a single flat list
    reps_by_cluster = list(last_reps.values())
    flat_reps = flatten_list(reps_by_cluster)

    # plotting the representatives, surrounded by small circles, and the centers of mass, marked with X
    for i, cluster_reps in enumerate(reps_by_cluster):
        rep_color = CURE_REPS_COLORS[i % len(CURE_REPS_COLORS)]
        n_shown = min(n_rep_fin, len(cluster_reps))

        x = [cluster_reps[j][0] for j in range(n_shown)]
        y = [cluster_reps[j][1] for j in range(n_shown)]

        ax.scatter(x, y, s=400, color=rep_color, edgecolor="black")
        ax.scatter(
            coms[i][0],
            coms[i][1],
            s=400,
            color=rep_color,
            marker="X",
            edgecolor="black",
        )

        for x_rep, y_rep in zip(x, y):
            ax.add_artist(
                plt.Circle(
                    (x_rep, y_rep),
                    xwidth * 0.03,
                    color=rep_color,
                    fill=False,
                    linewidth=3,
                    alpha=0.7,
                )
            )

    # the not sampled points are drawn once, after all the representatives: redrawing them at every
    # iteration of the loop above only stacked identical markers on top of each other
    ax.scatter(
        not_sampled[:, 0],
        not_sampled[:, 1],
        s=400,
        color="lime",
        edgecolor="black",
    )

    # find the closest representative for not sampled points, and draw an arrow connecting the points
    # to its closest representative
    for ns_point in not_sampled:
        closest = flat_reps[np.argmin([dist1(ns_point, el) for el in flat_reps])]

        ax.arrow(
            ns_point[0],
            ns_point[1],
            closest[0] - ns_point[0],
            closest[1] - ns_point[1],
            length_includes_head=True,
            head_width=0.03,
            head_length=0.05,
            color="black"
        )

    # plotting the indexes for each point
    annotate_points(annotations=initial_ind, points=X, ax=ax)

    if not_sampled_ind is not None:
        annotate_points(annotations=not_sampled_ind, points=not_sampled, ax=ax)

    plt.show()


def dist_clust_cure(rep_u: list, rep_v: list) -> float:
    """
    Distance between two clusters, defined as the smallest dist1 value over every pair made of
    one representative of the first cluster and one representative of the second cluster.

    :param rep_u: representatives of the first cluster.
    :param rep_v: representatives of the second cluster.
    :return: distance between two clusters.
    """
    first_reps = np.array(rep_u)
    second_reps = np.array(rep_v)

    # all pairwise distances, first cluster in the outer loop
    pairwise = [dist1(a, b) for a in first_reps for b in second_reps]

    return np.min(pairwise)


def update_mat_cure(mat: pd.DataFrame, i: int, j: int, rep_new: dict, name: str) -> pd.DataFrame:
    """
    Update distance matrix of CURE, by computing the new distances from the new representatives.

    A row and a column for the merged cluster are added to mat (the input dataframe is enlarged
    in place), then the rows and columns of the two merged clusters are dropped.

    :param mat: distance matrix between clusters, indexed by cluster name on both axes.
    :param i: row label of the first cluster to be merged.
    :param j: row label of the second cluster to be merged.
    :param rep_new: dictionary of representatives; it must contain an entry for name and one for
                    every label in the index of mat.
    :param name: string of the form "(" + u + ")-(" + u_cl + ")", containing the new
                 name of the newly merged cluster.
    :return: updated matrix with new distances
    """
    # labels of the 2 rows to be replaced
    first_label = mat.loc[i].name
    second_label = mat.loc[j].name
    merged_label = "(" + first_label + ")-(" + second_label + ")"

    # distances from the new cluster to every cluster currently in the matrix, looked up by label
    # so that the result does not depend on rep_new sharing the ordering of the matrix
    vec = [dist_clust_cure(rep_new[name], rep_new[label]) for label in mat.index]

    # adding new row
    mat.loc[merged_label, :] = vec
    # adding new column; the extra inf is the distance of the new cluster from itself
    mat[merged_label] = vec + [np.inf]

    # dropping the old rows and the old columns; index/columns keywords are used because recent
    # pandas versions reject the axis as a positional argument
    mat = mat.drop(index=[first_label, second_label])
    mat = mat.drop(columns=[first_label, second_label])

    return mat


def sel_rep(clusters: dict, name: str, c: int, alpha: float) -> list:
    """
    Select c representatives of the clusters: first one is the farthest from the centroid,
    the others c-1 are the farthest from the already selected representatives. It doesn't use
    the old representatives, so it is slower than sel_rep_fast.

    The points stored in clusters are never modified: the shrunk representatives are returned
    as new arrays.

    :param clusters: dictionary of clusters; clusters[name] must be a sequence of points.
    :param name: name of the cluster we want to select representatives from.
    :param c: number of representatives we want to extract.
    :param alpha: 0<=float<=1, it determines how much the representative points are moved
                 toward the centroid: 0 means they aren't modified, 1 means that all points
                 collapse to the centroid.
    :return: list of representative points.
    """
    points = [np.asarray(p) for p in clusters[name]]
    com = np.mean(clusters[name], axis=0)

    if len(points) <= c:
        # if the cluster has c points or less, all of them are representatives
        selected = points
    else:
        # first representative: the point farthest from the centroid
        chosen = [max(range(len(points)), key=lambda idx: dist1(points[idx], com))]

        # the other c-1 representatives: each time, the not yet selected point with the largest
        # total distance from the representatives selected so far
        for _ in range(c - 1):
            candidates = [idx for idx in range(len(points)) if idx not in chosen]
            chosen.append(
                max(
                    candidates,
                    key=lambda idx: np.sum([dist1(points[idx], points[s]) for s in chosen]),
                )
            )

        selected = [points[idx] for idx in chosen]

    # perform the shrinking according to the parameter alpha, building new arrays so that the
    # cluster's own points are left untouched
    return [p + alpha * (com - p) for p in selected]


def sel_rep_fast(prec_reps: list, clusters: dict, name: str, c: int, alpha: float) -> list:
    """
    Select c representatives of the clusters from the previously computed representatives,
    so it is faster than sel_rep.

    Neither prec_reps nor the points stored in clusters are modified: the shrunk representatives
    are returned as new arrays.

    :param prec_reps: list of previously computed representatives.
    :param clusters: dictionary of clusters; clusters[name] is only used to compute the centroid.
    :param name: name of the cluster we want to select representatives from.
    :param c: number of representatives we want to extract.
    :param alpha: 0<=float<=1, it determines how much the representative points are moved
                 toward the centroid: 0 means they aren't modified, 1 means that all points
                 collapse to the centroid.
    :return: list of representative points.
    """
    com = np.mean(clusters[name], axis=0)
    points = [np.asarray(p) for p in prec_reps]  # candidates are the old representatives

    if len(points) <= c:
        # if there are c old representatives or less, all of them are kept
        selected = points
    else:
        # first representative: the candidate farthest from the centroid
        chosen = [max(range(len(points)), key=lambda idx: dist1(points[idx], com))]

        # the other c-1 representatives: each time, the not yet selected candidate with the
        # largest total distance from the representatives selected so far
        for _ in range(c - 1):
            candidates = [idx for idx in range(len(points)) if idx not in chosen]
            chosen.append(
                max(
                    candidates,
                    key=lambda idx: np.sum([dist1(points[idx], points[s]) for s in chosen]),
                )
            )

        selected = [points[idx] for idx in chosen]

    # perform the shrinking according to the parameter alpha, building new arrays so that the
    # caller's list of old representatives is left untouched
    return [p + alpha * (com - p) for p in selected]


def form_new_cluster(clusters: dict, u: str, u_cl: str) -> list:
    """
    Form a new cluster from the input ones.

    Each input cluster is either a single point (a one-dimensional sequence of coordinates) or
    a sequence of points. The input clusters are not modified: a new list is always returned.

    :param clusters: existing clusters.
    :param u: first cluster.
    :param u_cl: second cluster.
    :return: new cluster obtained by merging the first and second cluster. Points of the first
             cluster come first, except when the first cluster is a single point and the second
             is not: in that case the single point is placed last.
    """

    def is_single_point(cluster) -> bool:
        # a lone point is a non-empty 1-D sequence of coordinates, whatever its dimensionality
        arr = np.asarray(cluster)
        return arr.ndim == 1 and arr.size > 0

    first, second = clusters[u], clusters[u_cl]
    first_single = is_single_point(first)
    second_single = is_single_point(second)

    first_members = [first] if first_single else list(first)
    second_members = [second] if second_single else list(second)

    if first_single and not second_single:
        return second_members + first_members

    return first_members + second_members


def cure(
    X: np.ndarray,
    k: int,
    c: int = 3,
    alpha: float = 0.1,
    plotting: bool = True,
    preprocessed_data=None,
    partial_index=None,
    n_rep_finalclust=None,
    not_sampled=None,
    not_sampled_ind=None,
):
    """
    CURE algorithm: hierarchical agglomerative clustering using representatives. The parameters which default to
    None are used for the large dataset variation of CURE.

    :param X: input data array.
    :param k: desired number of clusters.
    :param c: number of representatives for each cluster.
    :param alpha: parameter that regulates the shrinking of representative points toward the centroid.
    :param plotting: if True, plots all intermediate steps.
    :param preprocessed_data: if not None, must be of the form (clusters, representatives, matrix_a, X_dist),
                              which is used to perform a warm start.
    :param partial_index: if not None, it is used as index of the matrix_a, of cluster points and of representatives.
    :param n_rep_finalclust: the number of final representative points, for each cluster, used to classify
                             the not_sampled points.
    :param not_sampled: points not sampled in the initial phase.
    :param not_sampled_ind: indexes of not_sampled points.
    :return: the clusters dictionary, the dictionary of representatives, the matrix a.
    """
    # starting from raw data
    if preprocessed_data is None:
        # building a dataframe storing the x and y coordinates of input data points
        CURE_df, CURE_df_nonan = build_initial_matrices(X, partial_index)

        # initial clusters: every point is a cluster on its own
        if partial_index is not None:
            clusters = dict(zip(partial_index, X))
        else:
            clusters = {str(i): np.array(p) for i, p in enumerate(X)}

        # build Xdist
        X_dist = dist_mat_gen(CURE_df_nonan)

        # initialize representatives: every point represents itself
        if partial_index is not None:
            rep = {partial_index[i]: [X[int(i)]] for i in range(len(X))}
        else:
            rep = {str(i): [p] for i, p in enumerate(X)}

    # use precomputed data
    else:
        clusters = preprocessed_data[0]
        rep = preprocessed_data[1]
        CURE_df = preprocessed_data[2]
        X_dist = preprocessed_data[3]

    # just as placeholder for while loop: only its length matters before the first iteration
    heap = [1] * len(X_dist)

    # store minimum distances between clusters for each iteration
    levels = []

    # store original index, which is only needed (and only read) in the large dataset version
    initial_index = deepcopy(partial_index) if partial_index is not None else None

    # while the desired number of clusters has not been reached
    while len(heap) > k:

        # for every cluster, find the position of its closest cluster and the related distance;
        # heap stores the clusters sorted by the distance from their closest cluster
        list_argmin = list(X_dist.apply(lambda x: np.argmin(x)).values)
        list_min = list(X_dist.min(axis=0).values)
        heap = dict(sorted(zip(list(X_dist.index), list_min), key=lambda kv: kv[1]))
        closest = dict(zip(list(X_dist.index), list_argmin))

        # u is the cluster with the smallest distance from its closest cluster, u_cl; u is removed
        # from heap so that len(heap) equals the number of clusters left after this merge
        u = min(heap, key=heap.get)
        levels.append(heap[u])
        del heap[u]
        u_cl = X_dist.columns[closest[u]]

        new_cluster = form_new_cluster(clusters, u, u_cl)

        # delete old clusters
        del clusters[u]
        del clusters[u_cl]

        # set new name
        name = "(" + u + ")" + "-" + "(" + u_cl + ")"
        clusters[name] = new_cluster

        # update representatives, starting from the representatives of the merged clusters
        rep[name] = sel_rep_fast(rep[u] + rep[u_cl], clusters, name, c, alpha)

        # update distance matrix; the old representatives are still needed here, because
        # the matrix still contains the rows of u and u_cl
        X_dist = update_mat_cure(X_dist, u, u_cl, rep, name)

        # delete old representatives
        del rep[u]
        del rep[u_cl]

        # update the matrix a with the new cluster: the coordinates of u_cl are shifted to the
        # right of the (non-null) coordinates of u
        dim1 = int(CURE_df.loc[u].notna().sum())
        CURE_df.loc[name, :] = CURE_df.loc[u].fillna(0) + CURE_df.loc[u_cl].shift(
            dim1, fill_value=0
        )
        # the index keyword is used because recent pandas versions reject the axis as a
        # positional argument
        CURE_df = CURE_df.drop(index=[u, u_cl])

        if plotting is True:

            # in the large dataset version of CURE
            if partial_index is not None:
                plot_kwargs = dict(
                    X=X,
                    CURE_df=CURE_df,
                    reps=rep[name],
                    level_txt=levels[-1],
                    par_index=partial_index,
                    u=u,
                    u_cl=u_cl,
                    initial_ind=initial_index,
                )

                # only in last step of large dataset version of CURE
                if (
                    (len(heap) == k)
                    and (not_sampled is not None)
                    and (not_sampled_ind is not None)
                ):
                    # take random representative points from the final representatives
                    final_reps = {
                        label: random.sample(points, min(n_rep_finalclust, len(points)))
                        for label, points in rep.items()
                    }
                    plot_kwargs.update(
                        last_reps=final_reps,
                        not_sampled=not_sampled,
                        not_sampled_ind=not_sampled_ind,
                        n_rep_fin=n_rep_finalclust,
                    )

                partial_index = point_plot_mod2(**plot_kwargs)
            else:
                point_plot_mod2(X, CURE_df, rep[name], levels[-1])

    return clusters, rep, CURE_df


def plot_results_cure(clust: dict) -> None:
    """
    Scatter plot of the final clusters found by CURE algorithm: one scatter call per cluster,
    so each cluster gets its own color.

    :param clust: output of CURE algorithm, dictionary of the form cluster_labels+point_indices: coords of points
    """
    _, axes = plt.subplots(figsize=(14, 6))

    for members in clust.values():
        coords = np.array(members)

        # a cluster made of a single point is stored as a flat array of coordinates
        if coords.ndim <= 1:
            axes.scatter(coords[0], coords[1], s=300)
            continue

        axes.scatter(coords[:, 0], coords[:, 1], s=300)

    plt.show()


def dist_mat_gen_cure(reps: dict) -> pd.DataFrame:
    """
    Build distance matrix for CURE algorithm, using the dictionary of representatives.

    :param reps: dictionary of representative points, the only ones used to compute distances
                       between clusters.
    :return: symmetric distance matrix as dataframe, with the keys of reps (in their order) as
             index and columns, and infinite values on the diagonal.
    """
    labels = list(reps)
    n_clusters = len(labels)

    # the matrix is filled in a plain array and wrapped in a dataframe only once, instead of
    # enlarging the dataframe one cell at a time
    distances = np.full((n_clusters, n_clusters), np.inf)

    for row in range(n_clusters):
        for col in range(row + 1, n_clusters):
            value = dist_clust_cure(reps[labels[row]], reps[labels[col]])
            distances[row, col] = value
            distances[col, row] = value

    distance_matrix = pd.DataFrame(distances, index=labels, columns=labels)

    # undefined distances are treated as infinite
    return distance_matrix.fillna(np.inf)


def cure_sample_part(
    X: np.ndarray,
    k: int,
    c: int = 3,
    alpha: float = 0.3,
    u_min: Optional[int] = None,
    f: float = 0.3,
    d: float = 0.02,
    p: Optional[int] = None,
    q: Optional[int] = None,
    n_rep_finalclust: Optional[int] = None,
    plotting: bool = True,
):
    """
    CURE algorithm variation for large datasets.
    A sample of n points is drawn, with n computed by chernoffBounds, and split into p partitions;
    each partition is partially clustered until its number of clusters reduces to n/(pq). Then a
    second clustering pass is run on the partial clusters of all the partitions.

    :param X: input data array.
    :param k: desired number of clusters.
    :param c: number of representatives for each cluster.
    :param alpha: parameter that regulates the shrinking of representative points toward the centroid.
    :param u_min: size of the smallest cluster u; defaults to round(len(X) / k).
    :param f: percentage of cluster points (0 <= f <= 1) we would like to have in the sample.
    :param d: (0 <= d <= 1) the probability that the sample contains less than f*|u| points of cluster u is less than d.
    :param p: the number of partitions.
    :param q: the number >1 such that each partition reduces to n/(pq) clusters.
    :param n_rep_finalclust: number of representatives to use in the final assignment phase; defaults to c.
    :param plotting: if True, plots all intermediate steps.
    :return: the clusters dictionary, the dictionary of representatives, the matrix a.
    :raises ValueError: if only one of p and q is specified.
    """
    if (p is None) != (q is None):
        raise ValueError("p and q must be both specified if not None.")

    # choose the parameters suggested by the paper if the user doesnt provide input parameters
    if u_min is None:
        u_min = round(len(X) / k)

    if n_rep_finalclust is None:
        n_rep_finalclust = c

    _, df_nonan = build_initial_matrices(X)

    # this is done to ensure that the algorithm starts even when input params are bad: f and d
    # are relaxed until the required sample size fits in the dataset
    while True:
        print("new f: ", f)
        print("new d: ", d)
        n = math.ceil(chernoffBounds(u_min=u_min, f=f, N=len(X), k=k, d=d))

        if n <= len(df_nonan):
            b_sampled = df_nonan.sample(n, random_state=42)
            break

        if f >= 0.19:
            f = f - 0.1
        else:
            d = d * 2

    b_notsampled = df_nonan.loc[
        [str(i) for i in range(len(df_nonan)) if str(i) not in b_sampled.index], :
    ]

    # find the best p and q according to the paper
    if (p is None) and (q is None):
        results = {
            (i, j): (j - 1) / (i * j) + 1 / (j ** 2)
            for i in range(2, 15)
            for j in range(2, 15)
        }
        p, q = max(results, key=results.get)
        print("p: ", p)
        print("q: ", q)

    if (n / (p * q)) < 2 * k:
        print("n/pq is less than 2k, results could be wrong.")

    if k * d >= 1:
        print("k*d is greater or equal to 1, results could be wrong.")

    # form the partitions: p consecutive slices of the sample
    lin_sp = np.linspace(0, n, p + 1, dtype="int")
    b_partitions = [
        b_sampled.iloc[lin_sp[num_p]: lin_sp[num_p + 1]] for num_p in range(p)
    ]

    # number of clusters each partition is reduced to
    k_prov = round(n / (p * q))

    # perform clustering on each partition separately
    partial_clust = []
    partial_rep = []
    partial_CURE_df = []

    for i in range(p):
        print("\n")
        print(i)
        clusters, rep, CURE_df = cure(
            b_partitions[i].values,
            k=k_prov,
            c=c,
            alpha=alpha,
            plotting=plotting,
            partial_index=b_partitions[i].index,
        )
        partial_clust.append(clusters)
        partial_rep.append(rep)
        partial_CURE_df.append(CURE_df)

    # merging all data into single components
    # clusters
    clust_tot = {}
    for part_clusters in partial_clust:
        clust_tot.update(part_clusters)
    # representatives
    rep_tot = {}
    for part_reps in partial_rep:
        rep_tot.update(part_reps)
    # mat CURE_df: partitions whose size differs from the most common one get an extra pair of
    # empty coordinate columns
    diz = {i: len(b_partitions[i]) for i in range(p)}
    num_freq = Counter(diz.values()).most_common(1)[0][0]

    bad_ind = [part for part, size in diz.items() if size != num_freq]

    for ind in bad_ind:
        partial_CURE_df[ind]["{0}x".format(diz[ind])] = [np.nan] * k_prov
        partial_CURE_df[ind]["{0}y".format(diz[ind])] = [np.nan] * k_prov

    # stack the partial matrices; pd.concat also works with a single partition and replaces
    # DataFrame.append, which recent pandas versions no longer provide
    CURE_df_tot = pd.concat(partial_CURE_df)

    # mat Xdist
    X_dist_tot = dist_mat_gen_cure(rep_tot)

    # final_clustering, warm-started from the partial results
    prep_data = [clust_tot, rep_tot, CURE_df_tot, X_dist_tot]
    clusters, rep, CURE_df = cure(
        b_sampled.values,
        k=k,
        c=c,
        alpha=alpha,
        preprocessed_data=prep_data,
        partial_index=b_sampled.index,
        n_rep_finalclust=n_rep_finalclust,
        not_sampled=b_notsampled.values,
        plotting=plotting,
        not_sampled_ind=b_notsampled.index,
    )

    return clusters, rep, CURE_df


def demo_parameters():
    """Draw a 2x2 grid of plots showing how the sample size changes with d, k, f and u_min."""
    plt.figure(figsize=(12, 10))
    plt.suptitle("Effects on sample size from different parameters")

    N = 20000

    def compute_res(u_size, f, N, k, d):
        # sample size as a function of the parameters; works elementwise on arrays
        log_inv_d = np.log(1 / d)
        return k * (
            f * N
            + N / u_size * log_inv_d
            + N / u_size * np.sqrt(log_inv_d ** 2 + 2 * f * u_size * log_inv_d)
        )

    def draw_panel(position, title, x_values, y_values, x_label, with_y_label):
        # one panel: the curve, plus a red horizontal line at the dataset size N
        panel = plt.subplot(2, 2, position)
        panel.set_title(title)
        plt.axhline(N, color="r")
        plt.plot(x_values, y_values)
        plt.xlabel(x_label)
        if with_y_label:
            plt.ylabel("sample size")

    # top left: varying d
    u_size, f, k = 6000, 0.20, 4
    d = np.linspace(0.0000001, 1, 100)
    draw_panel(
        1,
        "u_min: {0}, f:{1}, k:{2}".format(u_size, f, k),
        d,
        compute_res(u_size, f, N, k, d),
        "d",
        True,
    )

    # top right: varying k
    u_size, f, d = 3000, 0.2, 0.1
    k = list(range(1, 13))
    draw_panel(
        2,
        "u_min: {0}, f:{1}, d:{2}".format(u_size, f, d),
        k,
        [compute_res(u_size, f, N, n_clusters, d) for n_clusters in k],
        "k",
        False,
    )

    # bottom left: varying f
    u_size, d, k = 5000, 0.1, 4
    f = np.linspace(0.00001, 1, 100)
    draw_panel(
        3,
        "u_min: {0}, d:{1}, k:{2}".format(u_size, d, k),
        f,
        compute_res(u_size, f, N, k, d),
        "f",
        True,
    )

    # bottom right: varying u_min
    f, d, k = 0.2, 0.1, 4
    u_size = np.linspace(200, 10000, 30)
    draw_panel(
        4,
        "f: {0}, d:{1}, k:{2}".format(f, d, k),
        u_size,
        compute_res(u_size, f, N, k, d),
        "u_min",
        False,
    )

    plt.show()