Source code for pyiomica.clusteringFunctions

'''Clustering-related functions'''

import scipy.signal
import scipy.spatial.distance
from scipy.interpolate import UnivariateSpline

from sklearn.cluster import AgglomerativeClustering, KMeans, MiniBatchKMeans
import sklearn.metrics

from .globalVariables import *

from . import (coreFunctions,
               visibilityGraphAuxiliaryFunctions,
               visibilityGraphCommunityDetection,
               utilityFunctions)
    
    
    
def getEstimatedNumberOfClusters(data, cluster_num_min, cluster_num_max, trials_to_do, numberOfAvailableCPUs=4, plotID=None, printScores=False):

    """Get estimated number of clusters using ARI with KMeans

    Parameters:
        data: 2d numpy.array
            Data to analyze

        cluster_num_min: int
            Minimum possible number of clusters

        cluster_num_max: int
            Maximum possible number of clusters

        trials_to_do: int
            Number of trials to do in ARI function

        numberOfAvailableCPUs: int, Default 4
            Number of processes to run in parallel

        plotID: str, Default None
            Label for the plot of peaks

        printScores: boolean, Default False
            Whether to print all scores

    Returns:
        int
            Estimated number of clusters (position of the largest peak of the ARI curve)

    Usage:
        n_clusters = getEstimatedNumberOfClusters(data, 1, 20, 25)
    """

    def getPeakPosition(scores, makePlot=False, plotID=None):

        print()

        spline = UnivariateSpline(scores.T[0], scores.T[1])
        spline.set_smoothing_factor(0.005)
        xs = np.linspace(scores.T[0][0], scores.T[0][-1], 1000)
        data = np.vstack((xs, spline(xs))).T

        data_all = data.copy()
        data = data[data.T[0] > 4.]
        peaks = scipy.signal.find_peaks(data.T[1])[0]

        if len(peaks) == 0:
            selected_peak = 5
            print('WARNING: no peak found')
        else:
            selected_peak = np.round(data.T[0][peaks[np.argmax(data.T[1][peaks])]], 0).astype(int)

        selected_peak_value = scores.T[1][np.argwhere(scores.T[0] == selected_peak)[0][0]]
        peaks = np.round(data.T[0][peaks], 0).astype(int) if len(peaks) != 0 else peaks

        #if makePlot:
        #    makePlotOfPeak(data_all, scores, selected_peak, selected_peak_value, plotID)

        print(selected_peak, peaks)

        return selected_peak, peaks

    print('Testing data clustering in a range of %s-%s clusters' % (cluster_num_min, cluster_num_max))

    scores = utilityFunctions.runCPUs(numberOfAvailableCPUs, runForClusterNum,
                                      [(cluster_num, copy.deepcopy(data), trials_to_do) for cluster_num in range(cluster_num_min, cluster_num_max + 1)])

    if printScores:
        print(scores)

    return getPeakPosition(scores, makePlot=True, plotID=plotID)[0]
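
# Illustrative usage sketch (editor's addition, not part of the original module):
# estimate the cluster count for six well-separated synthetic blobs. The blob
# layout, the 2-12 search range and the 25 trials are arbitrary assumptions; np is
# available through the globalVariables import above. Since the function spawns
# worker processes, call this only from an `if __name__ == '__main__':` block.

def _exampleEstimateNumberOfClusters():

    np.random.seed(0)
    example_data = np.vstack([np.random.normal(loc=shift, scale=0.5, size=(20, 8)) for shift in (0., 4., 8., 12., 16., 20.)])

    return getEstimatedNumberOfClusters(example_data, 2, 12, 25, numberOfAvailableCPUs=2)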

def getNClustersFromLinkageElbow(Y):

    """Get optimal number of clusters from linkage.
    A point of the highest acceleration of the fusion coefficient of the given linkage.

    Parameters:
        Y: 2d numpy.array
            Linkage matrix

    Returns:
        int
            Optimal number of clusters

    Usage:
        n_clusters = getNClustersFromLinkageElbow(Y)
    """

    return np.diff(np.array([[nc, Y[-nc + 1][2]] for nc in range(2, min(50, len(Y)))]).T[1], 2).argmax() + 1 if len(Y) >= 5 else 1

def getNClustersFromLinkageSilhouette(Y, data, metric):

    """Determine the optimal number of clusters in data by maximizing the Silhouette score.

    Parameters:
        Y: 2d numpy.array
            Linkage matrix

        data: 2d numpy.array
            Data to analyze

        metric: str or function
            Distance measure

    Returns:
        int
            Optimal number of clusters

    Usage:
        n_clusters = getNClustersFromLinkageSilhouette(Y, data, 'euclidean')
    """

    max_score = 0
    n_clusters = 1

    distmatrix = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(data, metric=metric))

    for temp_n_clusters in range(2, 10):
        print(temp_n_clusters, end='a, ', flush=True)
        temp_clusters = scipy.cluster.hierarchy.fcluster(Y, t=temp_n_clusters, criterion='maxclust')
        print(temp_n_clusters, end='b, ', flush=True)
        temp_score = sklearn.metrics.silhouette_score(distmatrix, temp_clusters, metric=metric)

        if temp_score > max_score:
            max_score = temp_score
            n_clusters = temp_n_clusters

    return n_clusters - 1
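
# Illustrative usage sketch (editor's addition, not part of the original module):
# build a linkage on two synthetic groups and compare the Elbow and Silhouette
# estimates. The data layout and the 'euclidean' metric are arbitrary assumptions;
# np and hierarchy come from the globalVariables import above.

def _exampleLinkageClusterEstimates():

    np.random.seed(0)
    example_data = np.vstack((np.random.normal(0., 1., (20, 5)),
                              np.random.normal(5., 1., (20, 5))))

    Y = hierarchy.linkage(example_data, method='weighted', metric='euclidean')

    print('Elbow estimate:', getNClustersFromLinkageElbow(Y))
    print('Silhouette estimate:', getNClustersFromLinkageSilhouette(Y, example_data, 'euclidean'))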

def runForClusterNum(arguments):

    """Calculate Adjusted Rand Index of the data for a range of cluster numbers.

    Parameters:
        arguments: tuple
            A tuple of three parameters in the form (cluster_num, data_array, trials_to_do), where

                cluster_num: int
                    Number of clusters

                data_array: 2d numpy.array
                    Data to test

                trials_to_do: int
                    Number of trials for each cluster number

    Returns:
        1d numpy.array
            Array of (cluster number, mean ARI, standard deviation of ARI)

    Usage:
        instPool = multiprocessing.Pool(processes = NumberOfAvailableCPUs)

        scores = instPool.map(runForClusterNum, [(cluster_num, copy.deepcopy(data), trials_to_do) for cluster_num in range(cluster_num_min, cluster_num_max + 1)])

        instPool.close()

        instPool.join()
    """

    np.random.seed()

    cluster_num, data_array, trials_to_do = arguments

    print(cluster_num, end=', ', flush=True)

    labels = [KMeans(n_clusters=cluster_num).fit(data_array).labels_ for i in range(trials_to_do)]

    agreement_matrix = np.zeros((trials_to_do, trials_to_do))

    for i in range(trials_to_do):
        for j in range(trials_to_do):
            agreement_matrix[i, j] = sklearn.metrics.adjusted_rand_score(labels[i], labels[j]) if agreement_matrix[j, i] == 0 else agreement_matrix[j, i]

    selected_data = agreement_matrix[np.triu_indices(agreement_matrix.shape[0], 1)]

    return np.array((cluster_num, np.mean(selected_data), np.std(selected_data)))
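
# Illustrative usage sketch (editor's addition, not part of the original module):
# runForClusterNum can also be called directly, without the multiprocessing pool
# shown in the docstring. The data and the 3 clusters / 10 trials are arbitrary
# assumptions.

def _exampleRunForClusterNum():

    np.random.seed(0)
    example_data = np.random.random((60, 4))

    cluster_num, mean_ari, std_ari = runForClusterNum((3, example_data, 10))

    print('\nMean ARI for 3 clusters:', mean_ari, '+/-', std_ari)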

def getGroupingIndex(data, n_groups=None, method='weighted', metric='correlation', significance='Elbow'):

    """Cluster data into N groups; if N is not provided, determine the optimal N.
    Returns the linkage matrix, cluster labels and possible cluster labels.

    Parameters:
        data: 2d numpy.array
            Data to analyze

        n_groups: int, Default None
            Number of groups to split data into

        method: str, Default 'weighted'
            Linkage calculation method

        metric: str, Default 'correlation'
            Distance measure

        significance: str, Default 'Elbow'
            Method for determining optimal number of groups and subgroups

    Returns:
        tuple
            Linkage matrix, cluster index, possible groups

    Usage:
        x, y, z = getGroupingIndex(data, method='weighted', metric='correlation', significance='Elbow')
    """

    Y = hierarchy.linkage(data, method=method, metric=metric, optimal_ordering=False)

    if n_groups is None:
        if significance == 'Elbow':
            n_groups = getNClustersFromLinkageElbow(Y)
        elif significance == 'Silhouette':
            n_groups = getNClustersFromLinkageSilhouette(Y, data, metric)

        print('n_groups:', n_groups)

    labelsClusterIndex = scipy.cluster.hierarchy.fcluster(Y, t=n_groups, criterion='maxclust')

    groups = np.sort(np.unique(labelsClusterIndex))

    print([np.sum(labelsClusterIndex == group) for group in groups])

    return Y, labelsClusterIndex, groups
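
# Illustrative usage sketch (editor's addition, not part of the original module):
# cluster random signals by correlation distance and let the Elbow criterion pick
# the number of groups. The data shape is an arbitrary assumption.

def _exampleGetGroupingIndex():

    np.random.seed(0)
    example_signals = np.random.random((30, 12))

    Y, labelsClusterIndex, groups = getGroupingIndex(example_signals, method='weighted', metric='correlation', significance='Elbow')

    print('Groups found:', groups)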

def makeClusteringObject(df_data, df_data_autocorr, method='weighted', metric='correlation', significance='Elbow'):

    """Make a clustering Groups-Subgroups dictionary object.

    Parameters:
        df_data: pandas.DataFrame
            Data to analyze in DataFrame format

        df_data_autocorr: pandas.DataFrame
            Autocorrelations or periodograms in DataFrame format

        method: str, Default 'weighted'
            Linkage calculation method

        metric: str, Default 'correlation'
            Distance measure

        significance: str, Default 'Elbow'
            Method for determining optimal number of groups and subgroups

    Returns:
        dictionary
            Clustering object

    Usage:
        myObj = makeClusteringObject(df_data, df_data_autocorr, significance='Elbow')
    """

    def getSubgroups(df_data, metric, significance):

        Y = hierarchy.linkage(df_data.values, method=method, metric=metric, optimal_ordering=True)

        leaves = hierarchy.dendrogram(Y, no_plot=True)['leaves']

        if significance == 'Elbow':
            n_clusters = getNClustersFromLinkageElbow(Y)
        elif significance == 'Silhouette':
            n_clusters = getNClustersFromLinkageSilhouette(Y, df_data.values, metric)

        print('n_subgroups:', n_clusters)

        clusters = scipy.cluster.hierarchy.fcluster(Y, t=n_clusters, criterion='maxclust')[leaves]

        return {cluster: df_data.index[leaves][clusters == cluster] for cluster in np.unique(clusters)}, Y

    ClusteringObject = {}

    try:
        grouping = getGroupingIndex(df_data_autocorr.values, method=method, metric=metric, significance=significance)
    except Exception as exception:
        print(exception)
        print('Returning None')

        return None

    ClusteringObject['linkage'], labelsClusterIndex, groups = grouping

    for group in groups:
        signals = df_data.index[labelsClusterIndex == group]

        ClusteringObject[group], ClusteringObject[group]['linkage'] = ({1: signals}, None) if len(signals) == 1 else getSubgroups(df_data.loc[signals], coreFunctions.metricCommonEuclidean, significance)

        for subgroup in sorted([item for item in list(ClusteringObject[group].keys()) if not item == 'linkage']):
            ClusteringObject[group][subgroup] = {'order': [np.where([temp == signal for temp in df_data.index.values])[0][0] for signal in list(ClusteringObject[group][subgroup])],
                                                 'data': df_data.loc[ClusteringObject[group][subgroup]],
                                                 'dataAutocorr': df_data_autocorr.loc[ClusteringObject[group][subgroup]]}

    return ClusteringObject

def exportClusteringObject(ClusteringObject, saveDir, dataName, includeData=True, includeAutocorr=True):

    """Export a clustering Groups-Subgroups dictionary object to a SpreadSheet.
    Linkage data is not exported.

    Parameters:
        ClusteringObject: dictionary
            Clustering object

        saveDir: str
            Path of directories to save the object to

        dataName: str
            Label to include in the file name

        includeData: boolean, Default True
            Export data

        includeAutocorr: boolean, Default True
            Export autocorrelations of data

    Returns:
        str
            File name of the exported clustering object

    Usage:
        exportClusteringObject(myObj, '/dir1', 'myObj')
    """

    if not os.path.exists(saveDir):
        os.makedirs(saveDir)

    fileName = saveDir + dataName + '_GroupsSubgroups.xlsx'

    writer = pd.ExcelWriter(fileName)

    for group in sorted([item for item in list(ClusteringObject.keys()) if not item == 'linkage']):
        for subgroup in sorted([item for item in list(ClusteringObject[group].keys()) if not item == 'linkage']):

            df_data = ClusteringObject[group][subgroup]['data'].iloc[::-1]
            df_dataAutocorr = ClusteringObject[group][subgroup]['dataAutocorr'].iloc[::-1]

            if includeData == True and includeAutocorr == True:
                df = pd.concat((df_data, df_dataAutocorr), sort=False, axis=1)
            elif includeData == True and includeAutocorr == False:
                df = df_data
            elif includeData == False and includeAutocorr == True:
                df = df_dataAutocorr
            else:
                df = pd.DataFrame(index=df_data.index)

            df.index.name = 'Index'
            df.to_excel(writer, 'G%sS%s' % (group, subgroup))

    writer.save()

    print('Saved clustering object to:', fileName)

    return fileName
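
# Illustrative usage sketch (editor's addition, not part of the original module):
# build a clustering object from a small synthetic dataset and export it. The
# random signals, the correlation-derived placeholder used in place of real
# autocorrelations, and the output path are arbitrary assumptions.

def _exampleMakeAndExportClusteringObject():

    np.random.seed(0)
    values = np.random.random((12, 10))

    df_data = pd.DataFrame(data=values, index=['signal_%s' % i for i in range(12)])

    # A real workflow would put autocorrelations or periodograms of df_data here;
    # a correlation-derived placeholder keeps the sketch short.
    df_data_autocorr = pd.DataFrame(data=np.abs(np.corrcoef(values)), index=df_data.index)

    myObj = makeClusteringObject(df_data, df_data_autocorr, significance='Elbow')

    if myObj is not None:
        exportClusteringObject(myObj, './example_output/', 'exampleData')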

def getCommunitiesOfTimeSeries(data, times, minNumberOfCommunities=2, horizontal=False, method='WDPVG', direction='left', weight='distance'):

    '''Get communities of time series

    Parameters:
        data: 1d numpy.array
            Data array

        times: 1d numpy.array
            Times corresponding to data points

        minNumberOfCommunities: int, Default 2
            Number of communities to find; depends on the number of splits.
            This parameter is ignored in methods that automatically estimate the optimal number of communities.

        horizontal: boolean, Default False
            Whether to use the horizontal rather than the normal visibility graph

        method: str, Default 'WDPVG'
            Name of the method to use:
                'Girvan_Newman': edge betweenness centrality based approach

                'betweenness_centrality': reflected graph node betweenness centrality based approach

                'WDPVG': weighted dual perspective visibility graph method (also set weight variable)

        direction: str, Default 'left'
            The direction that nodes aggregate to communities:
                None: no specific direction, e.g. both sides

                'left': nodes can only aggregate to the left side hubs, e.g. early hubs

                'right': nodes can only aggregate to the right side hubs, e.g. later hubs

        weight: str, Default 'distance'
            Type of weight for method='WDPVG':
                None: no weight

                'time': weight = abs(times[i] - times[j])

                'tan': weight = abs((data[i] - data[j])/(times[i] - times[j])) + 10**(-8)

                'distance': weight = A[i, j] = A[j, i] = ((data[i] - data[j])**2 + (times[i] - times[j])**2)**0.5

    Returns:
        (list, graph)
            List of communities and a networkx graph

    Usage:
        res = getCommunitiesOfTimeSeries(data, times)
    '''

    if method == 'betweenness_centrality':
        if horizontal:
            graph_nx = nx.from_numpy_matrix(visibilityGraphAuxiliaryFunctions.getAdjacencyMatrixOfHVG(data))
            graph_nx_inv = nx.from_numpy_matrix(visibilityGraphAuxiliaryFunctions.getAdjacencyMatrixOfHVG(-data))
        else:
            graph_nx = nx.from_numpy_matrix(visibilityGraphAuxiliaryFunctions.getAdjacencyMatrixOfNVG(data, times))
            graph_nx_inv = nx.from_numpy_matrix(visibilityGraphAuxiliaryFunctions.getAdjacencyMatrixOfNVG(-data, times))

        def find_and_remove_node(graph_nx):
            bc = nx.betweenness_centrality(graph_nx)
            node_to_remove = list(bc.keys())[np.argmax(list(bc.values()))]
            graph_nx.remove_node(node_to_remove)
            return graph_nx, node_to_remove

        list_of_nodes = []
        for i in range(minNumberOfCommunities - 1):
            graph_nx_inv, node = find_and_remove_node(graph_nx_inv)
            list_of_nodes.append(node)

        if not 0 in list_of_nodes:
            list_of_nodes.append(0)

        list_of_nodes.append(list(graph_nx.nodes)[-1] + 1)
        list_of_nodes.sort()

        communities = [list(range(list_of_nodes[i], list_of_nodes[i + 1])) for i in range(len(list_of_nodes) - 1)]

    elif method == 'Girvan_Newman':
        if horizontal:
            graph_nx = nx.from_numpy_matrix(visibilityGraphAuxiliaryFunctions.getAdjacencyMatrixOfHVG(data))
        else:
            graph_nx = nx.from_numpy_matrix(visibilityGraphAuxiliaryFunctions.getAdjacencyMatrixOfNVG(data, times))

        generator_of_communities = nx.algorithms.community.centrality.girvan_newman(graph_nx)

        for i in range(minNumberOfCommunities - 1):
            communities_for_level = next(generator_of_communities)

        communities = list(sorted(c) for c in communities_for_level)

    elif method == 'WDPVG':
        if horizontal:
            graph_nx = visibilityGraphCommunityDetection.createVisibilityGraph(data, times, "dual_horizontal", weight=weight)[0]
        else:
            graph_nx = visibilityGraphCommunityDetection.createVisibilityGraph(data, times, "dual_natural", weight=weight)[0]

        communities = visibilityGraphCommunityDetection.communityDetectByPathLength(graph_nx, direction=direction, cutoff='auto')

    else:
        print('Unknown method: %s' % method)

        return None

    return communities, graph_nx
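
# Illustrative usage sketch (editor's addition, not part of the original module):
# detect communities in a short noisy sine wave with the default WDPVG method.
# The signal itself and the 'left' aggregation direction are arbitrary assumptions.

def _exampleTimeSeriesCommunities():

    np.random.seed(0)
    times = np.linspace(0., 2. * np.pi, 50)
    data = np.sin(times) + 0.1 * np.random.random(50)

    communities, graph_nx = getCommunitiesOfTimeSeries(data, times, method='WDPVG', direction='left')

    print('Communities:', communities)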