Source code for pyiomica.frequencySubjectMatch
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from scipy.stats.stats import pearsonr
from collections import Counter
from copy import deepcopy
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
[docs]def bootstrapGeneral(df, N, shuffling = True):
'''To generate bootstrap samples
Parameters:
df: pandas dataframe
the source dataframe using to generate bootstrap samples
N: integer
the size of bootstrap samples
shuffling : boolean
shuffle the data or not, The default is True.
Returns:
bootstrapDF: pandas dataframe
the bootstrap samples
'''
bootstrapDF = pd.DataFrame()
df.reset_index(drop=True, inplace=True)
if shuffling:
for column in df:
bootstrapDF[column] = df[column].iloc[np.random.randint(df.shape[0], size=N)].values
else:
bootstrapDF = df.iloc[np.random.randint(df.shape[0], size=N)]
return bootstrapDF
[docs]def calculateLinksBetweenSubjectsByDistance(df1,df2,cutoff):
'''To calculate the linked time series/Genes from two dataframes base on the Euclidean distance
Parameters:
df1: pandas dataframe
the first time series from df1
df2: pandas dataframe
the second time series from df2
cutoff: float
if the distance between two time series less than cutoff, the two time series is linked
time series
Returns:
numlinkedGenes: integer/float
number of linked time series
commonGenes: integer/float
number of common time series in df1 and df2
linkedGenes: list of string
the ids/names of linked time series
'''
idx = df1.index.intersection(df2.index)
numlinkedGenes = 0
linkedGenes = []
commonGenes = len(idx)
if commonGenes != 0:
for r in idx:
s1 = df1.loc[r]
s2 = df2.loc[r]
d = np.linalg.norm(s1-s2)
if d < cutoff:
numlinkedGenes += 1
linkedGenes.append(r)
return numlinkedGenes, commonGenes,linkedGenes
[docs]def calculateLinksBetweenSubjectsByCorrelation(df1,df2,cutoff):
'''To calculate the linked time series/Genes from two dataframes base on the pearson correlation
Parameters:
df1: pandas dataframe
the first time series from df1
df2: pandas dataframe
the second time series from df2
cutoff: float
if the pearson correlation between two time series less than cutoff, the two time series is linked
time series
Returns:
numlinkedGenes: integer/float
number of linked time series
commonGenes: integer/float
number of common time series in df1 and df2
linkedGenes: list of string
the ids/names of linked time series
'''
idx = df1.index.intersection(df2.index)
numlinkedGenes = 0
linkedGenes = []
commonGenes = len(idx)
if commonGenes != 0:
for r in idx:
s1 = df1.loc[r]
s2 = df2.loc[r]
d = pearsonr(s1, s2)[0]
if d > cutoff:
numlinkedGenes += 1
linkedGenes.append(r)
return numlinkedGenes, commonGenes,linkedGenes
[docs]def getCommunityStructure(cs):
'''
To change community structure from {node1:community1, node2:community2,...} to
{community1:[node1, node2,...], community2:[node3, node4,...]}
Parameters:
cs: dictionary
the community structure as {node1:community1, node2:community2,...}
Returns:
community_structure: dictionary
the community structure as {community1:[node1, node2,...], community2:[node3, node4,...]}
'''
commu_list = list(cs.values())
community_structure = {}
for commu in commu_list:
templist = []
for k,v in cs.items():
if int(v) == int(commu):
templist.append(k)
community_structure[commu] = deepcopy(templist)
return community_structure
[docs]def getCommunityGenesDict(community_structure,genelist,endwithString):
'''To get gene IDs list of each community within selected individuals' category
Parameters:
community_structure: dictionary
the community structure as {community1:[node1, node2,...], community2:[node3, node4,...]}
genelist: dictionary
the gene list of each individuals, the key is the id of individual
endwithString: list of string
the selected individuals categories, which attached to the end of the individual ids
Returns:
community_genes_dict: dictionary
the genes list of each community
'''
community_genes_dict = {}
for key in community_structure.keys():
#for k in [1]:
genes_list = []
subjectlist = community_structure[key]
for k in genelist.keys():
if (k[0].endswith(endwithString) and k[1].endswith(endwithString)
and k[0] in subjectlist and k[1] in subjectlist):
#if sub.endswith('Prediabetic'):
genes_list.append(set(genelist[k]))
#copy_genes_list = deepcopy(genes_list)
if len(genes_list) > 0:
genes_set = list(set.intersection(*genes_list))
if len(genes_set):
community_genes_dict[key] = genes_set
community_genes_dict = {k: v for k, v in community_genes_dict.items() if len(v)}
return community_genes_dict
[docs]def splitGenes(community_gene_dict):
'''Split gene ids, to seperate the genes name from attached labels
Parameters:
community_gene_dict: dictionary
the genes ids list of each community
Returns:
new_dict: dictionary
the gene names list of each community
'''
new_dict = {}
for key in community_gene_dict:
fl = []
l = community_gene_dict[key]
for name in l:
fl.append(name.split('+')[0])
new_dict[key] = fl
return new_dict
[docs]def getCommunityTopGenesByNumber(community_structure,genelist,endwithString,numberOfTopGenes=500):
'''To get the top ranking genes of each community
Parameters:
community_structure: dictionary
the community structure as {community1:[node1, node2,...], community2:[node3, node4,...]}
genelist: dictionary
the genes list of each community
endwithString: list of string
the selected individuals categories, which attached to the end of the individual ids
numberOfTopGenes: integer, optional
the number of top ranking genes. The default is 500.
Returns:
community_genes_dict: dictionary
the top ranking genes of each community
'''
community_genes_dict = {}
for key in community_structure.keys():
#for k in [1]:
genes_list = []
subjectlist = community_structure[key]
for k in genelist.keys():
if (k[0].endswith(endwithString) and k[1].endswith(endwithString)
and k[0] in subjectlist and k[1] in subjectlist):
#if sub.endswith('Prediabetic'):
genes_list.extend(genelist[k])
counted = Counter(genes_list)
ordered = [value for value, count in counted.most_common()]
if len(ordered) >= numberOfTopGenes:
community_genes_dict[key] = ordered[0:numberOfTopGenes]
else:
print("number of genes less than choose number of top genes")
community_genes_dict[key] = ordered
community_genes_dict = {k: v for k, v in community_genes_dict.items() if len(v)}
return community_genes_dict
[docs]def getCommunityTopGenesByFrequencyRanking(community_structure,genelist,endwithString,frequencyPercentage=50):
'''To get the top frequency genes of each community
Parameters:
community_structure: dictionary
the community structure as {community1:[node1, node2,...], community2:[node3, node4,...]}
genelist: dictionary
the genes list of each community
endwithString: list of string
the selected individuals categories, which attached to the end of the individual ids
frequencyPercentage: float, optional
the top percentage frequency of choosed genes, The default is 50.
Returns:
community_genes_dict: dictionary
the top percentage frequency genes of each community
'''
community_genes_dict = {}
for key in community_structure.keys():
#for k in [1]:
genes_list = []
subjectlist = community_structure[key]
for k in genelist.keys():
if (k[0].endswith(endwithString) and k[1].endswith(endwithString)
and k[0] in subjectlist and k[1] in subjectlist):
#if sub.endswith('Prediabetic'):
genes_list.extend(genelist[k])
counted = Counter(genes_list)
top_ranking_value = round(max(counted.values()) * (1 - frequencyPercentage/100))
genes = [k for k, v in counted.items() if int(v) >= top_ranking_value]
community_genes_dict[key] = genes
community_genes_dict = {k: v for k, v in community_genes_dict.items() if len(v)}
return community_genes_dict
[docs]def optimizeK(df,rangeK,saveFig=False,**kargs):
'''To optimize the k value of k-mean cluster
Parameters:
df: pandas dataframe
the data source to do k-mean cluster
rangeK: python range, e.g. rangeK = range(0,10)
the K value range
saveFig: boolean, optional
save figure or not. The default is False.
\*\*kargs: figure name
if saveFig is true, the \*\*kargs is the figure name
Returns:
optimizek:integer
the optimized K value
'''
sil = []
elbow = []
K = rangeK
for k in K:
km = KMeans(n_clusters=k)
km = km.fit(df)
labels = km.labels_
elbow.append(km.inertia_)
sil.append(silhouette_score(df, labels, metric = 'euclidean'))
maxindex = sil.index(max(sil))
optimizek = K[maxindex]
if saveFig:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 6)) # create figure & 1 axis
ax1.plot(K, sil, 'bx-')
ax1.set_xlabel('k')
ax1.set_ylabel('Silhouette_score')
ax1.set_title('Silhouette Method For Optimal k')
ax2.plot(K, elbow, 'rx-')
ax2.set_xlabel('k')
ax2.set_ylabel('Sum_of_squared_distances')
ax2.set_title('Elbow Method For Optimal k')
plt.tight_layout()
fig.savefig(kargs['figname']) # save the figure to file
plt.close(fig)
return optimizek