G. Mias Lab »
Source code for pyiomica.frequencySubjectMatch

# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
from scipy.stats.stats import pearsonr
from collections import Counter
from copy import deepcopy

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

import matplotlib.pyplot as plt


[docs]def bootstrapGeneral(df, N, shuffling = True):
    '''To generate bootstrap samples
    
    Parameters:
        df: pandas dataframe
            the source dataframe using to generate bootstrap samples
        N: integer
            the size of bootstrap samples
        shuffling : boolean
            shuffle the data or not, The default is True.
    
    Returns:
        bootstrapDF: pandas dataframe
            the bootstrap samples
    '''


    bootstrapDF = pd.DataFrame()
    df.reset_index(drop=True, inplace=True)
    if shuffling:
        for column in df:
            bootstrapDF[column] = df[column].iloc[np.random.randint(df.shape[0], size=N)].values
            
    else:
        bootstrapDF = df.iloc[np.random.randint(df.shape[0], size=N)]
    
    return bootstrapDF

[docs]def calculateLinksBetweenSubjectsByDistance(df1,df2,cutoff):
    '''To calculate the linked time series/Genes from two dataframes base on the Euclidean distance
    
    Parameters:
        df1: pandas dataframe
            the first time series from df1
        df2: pandas dataframe
            the second time series from df2
        cutoff: float
            if the distance between two time series less than cutoff, the two time series is linked
            time series
    
    Returns:
        numlinkedGenes: integer/float
            number of linked time series
        commonGenes: integer/float
            number of common time series in df1 and df2
        linkedGenes: list of string
            the ids/names of linked time series
    '''
    
    idx = df1.index.intersection(df2.index)
    numlinkedGenes = 0
    linkedGenes = []
    commonGenes = len(idx)
    if commonGenes != 0:
        for r in idx:
            s1 = df1.loc[r]
            s2 = df2.loc[r]
            
            d = np.linalg.norm(s1-s2)
            if d < cutoff:
                numlinkedGenes += 1
                linkedGenes.append(r)
    return numlinkedGenes, commonGenes,linkedGenes

[docs]def calculateLinksBetweenSubjectsByCorrelation(df1,df2,cutoff):
    '''To calculate the linked time series/Genes from two dataframes base on the pearson correlation
    
    Parameters:
        df1: pandas dataframe
            the first time series from df1
        df2: pandas dataframe
            the second time series from df2
        cutoff: float
            if the pearson correlation between two time series less than cutoff, the two time series is linked
            time series
    
    Returns:
        numlinkedGenes: integer/float
            number of linked time series
        commonGenes: integer/float
            number of common time series in df1 and df2
        linkedGenes: list of string
            the ids/names of linked time series
    
    '''
    
    idx = df1.index.intersection(df2.index)
    numlinkedGenes = 0
    linkedGenes = []
    commonGenes = len(idx)
    if commonGenes != 0:
        for r in idx:
            s1 = df1.loc[r]
            s2 = df2.loc[r]
            
            d = pearsonr(s1, s2)[0]
            if d > cutoff:
                numlinkedGenes += 1
                linkedGenes.append(r)
    return numlinkedGenes, commonGenes,linkedGenes

[docs]def getCommunityStructure(cs):
    '''
    To change community structure from {node1:community1, node2:community2,...} to
    {community1:[node1, node2,...], community2:[node3, node4,...]}

    Parameters:
        cs: dictionary
            the community structure as {node1:community1, node2:community2,...}
        
    Returns:
        community_structure: dictionary
            the community structure as {community1:[node1, node2,...], community2:[node3, node4,...]}
    '''
    commu_list = list(cs.values())
    community_structure = {}
    for commu in commu_list:
        templist = []
        for k,v in cs.items():
            if int(v) == int(commu):
                templist.append(k)
        community_structure[commu] = deepcopy(templist)
    return community_structure

[docs]def getCommunityGenesDict(community_structure,genelist,endwithString):
    '''To get gene IDs list of each community within selected individuals' category 

    Parameters:
        community_structure: dictionary
                the community structure as {community1:[node1, node2,...], community2:[node3, node4,...]}
        genelist: dictionary
            the gene list of each individuals, the key is the id of individual 
        endwithString: list of string
           the selected individuals categories, which attached to the end of the individual ids 
    
    Returns:
        community_genes_dict: dictionary
            the genes list of each community
    
    '''

    community_genes_dict = {}
    for key in community_structure.keys():
    #for k in [1]:    
        genes_list = []
        subjectlist = community_structure[key]
        for k in genelist.keys():
            if (k[0].endswith(endwithString) and k[1].endswith(endwithString)
                and k[0] in subjectlist and k[1] in subjectlist):
            #if sub.endswith('Prediabetic'):
                genes_list.append(set(genelist[k]))
            
        #copy_genes_list = deepcopy(genes_list)
        
        if len(genes_list) > 0:
            genes_set = list(set.intersection(*genes_list))
            if len(genes_set):
                community_genes_dict[key] = genes_set
    community_genes_dict = {k: v for k, v in community_genes_dict.items() if len(v)}
    return community_genes_dict

[docs]def splitGenes(community_gene_dict):
    '''Split gene ids, to seperate the genes name from attached labels

    Parameters:
        community_gene_dict: dictionary
            the genes ids list of each community
    
    Returns:
        new_dict: dictionary
            the gene names list of each community
    '''
    new_dict = {}
    for key in community_gene_dict:
        fl = []
        l = community_gene_dict[key]
        for name in l:
            fl.append(name.split('+')[0])
        new_dict[key] = fl
        
    return new_dict

[docs]def getCommunityTopGenesByNumber(community_structure,genelist,endwithString,numberOfTopGenes=500):
    '''To get the top ranking genes of each community

    Parameters:
        community_structure: dictionary
            the community structure as {community1:[node1, node2,...], community2:[node3, node4,...]}
        genelist: dictionary
            the genes list of each community 
        endwithString: list of string
            the selected individuals categories, which attached to the end of the individual ids 
        numberOfTopGenes: integer, optional
            the number of top ranking genes. The default is 500.
    
    Returns:
        community_genes_dict: dictionary
            the top ranking genes of each community
    '''

    community_genes_dict = {}
    for key in community_structure.keys():
    #for k in [1]:    
        genes_list = []
        subjectlist = community_structure[key]
        for k in genelist.keys():
            if (k[0].endswith(endwithString) and k[1].endswith(endwithString)
                and k[0] in subjectlist and k[1] in subjectlist):
            #if sub.endswith('Prediabetic'):
                genes_list.extend(genelist[k])
            
        counted = Counter(genes_list)
        ordered = [value for value, count in counted.most_common()]
        
        if len(ordered) >= numberOfTopGenes:
            community_genes_dict[key] = ordered[0:numberOfTopGenes]
        else:
            print("number of genes less than choose number of top genes")
            community_genes_dict[key] = ordered
            
    community_genes_dict = {k: v for k, v in community_genes_dict.items() if len(v)}
            
    return community_genes_dict

[docs]def getCommunityTopGenesByFrequencyRanking(community_structure,genelist,endwithString,frequencyPercentage=50):
    '''To get the top frequency genes of each community
    
    Parameters:
        community_structure: dictionary
            the community structure as {community1:[node1, node2,...], community2:[node3, node4,...]}
        genelist: dictionary
            the genes list of each community 
        endwithString: list of string
            the selected individuals categories, which attached to the end of the individual ids 
        frequencyPercentage: float, optional
            the top percentage frequency of choosed genes, The default is 50.
    
    Returns:
        community_genes_dict: dictionary
            the top percentage frequency genes of each community
    '''

    community_genes_dict = {}
    for key in community_structure.keys():
    #for k in [1]:    
        genes_list = []
        subjectlist = community_structure[key]
        for k in genelist.keys():
            if (k[0].endswith(endwithString) and k[1].endswith(endwithString)
                and k[0] in subjectlist and k[1] in subjectlist):
            #if sub.endswith('Prediabetic'):
                genes_list.extend(genelist[k])
            
        counted = Counter(genes_list)
        top_ranking_value = round(max(counted.values()) * (1 - frequencyPercentage/100))
        genes = [k for k, v in counted.items() if int(v) >= top_ranking_value]

        community_genes_dict[key] = genes
           
    community_genes_dict = {k: v for k, v in community_genes_dict.items() if len(v)}
            
    return community_genes_dict

[docs]def optimizeK(df,rangeK,saveFig=False,**kargs):
    '''To optimize the k value of k-mean cluster

    Parameters:
        df: pandas dataframe
            the data source to do k-mean cluster
        rangeK: python range, e.g. rangeK = range(0,10)
            the K value range
        saveFig: boolean, optional
            save figure or not. The default is False.
        \*\*kargs: figure name
            if saveFig is true, the \*\*kargs is the figure name 
    
    Returns:
        optimizek:integer
            the optimized K value
    
    '''
    
    sil = []
    elbow = []
    K = rangeK
    for k in K:
        km = KMeans(n_clusters=k)
        km = km.fit(df)
        labels = km.labels_
        elbow.append(km.inertia_)
        sil.append(silhouette_score(df, labels, metric = 'euclidean'))
    
    maxindex = sil.index(max(sil))
    optimizek = K[maxindex]
    
    if saveFig:
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 6)) # create figure & 1 axis
        ax1.plot(K, sil, 'bx-')
        ax1.set_xlabel('k')
        ax1.set_ylabel('Silhouette_score')
        ax1.set_title('Silhouette Method For Optimal k')
        
        ax2.plot(K, elbow, 'rx-')
        ax2.set_xlabel('k')
        ax2.set_ylabel('Sum_of_squared_distances')
        ax2.set_title('Elbow Method For Optimal k')
        plt.tight_layout()
        fig.savefig(kargs['figname'])   # save the figure to file
        plt.close(fig) 
    
    return optimizek