G. Mias Lab »
Source code for pyiomica.enrichmentAnalyses

'''Annotations and Enumerations'''


import pymysql
import datetime
import urllib.request
import requests

from .globalVariables import *

from . import utilityFunctions
from . import dataStorage


[docs]def internalAnalysisFunction(data, multiCorr, MultipleList,  OutputID, InputID, Species, totalMembers,
                            pValueCutoff, ReportFilterFunction, ReportFilter, TestFunction, HypothesisFunction, FilterSignificant,
                            AssignmentForwardDictionary, AssignmentReverseDictionary, prefix, infoDict):

    """Analysis for Multi-Omics or Single-Omics input list
    The function is used internally and not intended to be used directly by user.
    
    Usage:
        Intended for internal use
    """
    
    listData = data[list(data.keys())[0]]

    #If input data was a list of genes, convert it to pairs with label "Generic"
    if not type(listData[0]) is list:
        listData = [[item, 'Unknown'] for item in listData]
    else:
        if len(listData[0])==1:
            listData = [[item[0], 'Unknown'] for item in listData]

    #Get IDs for each gene
    dataForGeneTranslation = [item[0] if type(item) is list else item for item in listData]
    IDs = GeneTranslation(dataForGeneTranslation, OutputID, ConstantGeneDictionary, InputID = InputID,  Species = Species)[OutputID]

    #Remove Missing IDs
    [item.remove('Missing') if 'Missing' in item else None for item in IDs]

    #Might have put in mixed IDs correspocting to different omics types still, give option to user;
    #Extract the translations, match the ones that intersect for different modes (e.g. RNA/Protein label (last element) must be same)
    membersWithAssociations = {}
    for gene, geneIDs in zip(listData, IDs):

        if len(gene)==4:
            geneKey, _, _, geneOmi = gene
        elif len(gene)==3:
            geneKey, geneOmi, _ = gene
        else:
            geneKey, geneOmi = gene

        if MultipleList:
            geneKey += '_' + geneOmi

        if geneKey in membersWithAssociations.keys():
            labels = membersWithAssociations[geneKey][0]
            if not geneOmi in labels:
                labels.append(geneOmi)
        else:
            membersWithAssociations[geneKey] = [[geneOmi],[]]

        for ID in geneIDs:
            if prefix + ID in list(AssignmentForwardDictionary.keys()):
                for AssignmentID in list(AssignmentForwardDictionary[prefix + ID]):
                    if not AssignmentID in membersWithAssociations[geneKey][1]:
                        membersWithAssociations[geneKey][1].append(AssignmentID)
        if len(membersWithAssociations[geneKey][1])==0:
            membersWithAssociations.pop(geneKey)

    allAssignmentIDs = []
    for thisGeneGOlist in [item[1] for item in list(membersWithAssociations.values())]:
        for AssignmentID in thisGeneGOlist:
            if not AssignmentID in allAssignmentIDs:
                allAssignmentIDs.append(AssignmentID)

    testCats = {}
    for AssignmentID in allAssignmentIDs:
        countsInList = len(membersWithAssociations.keys())
        countsInFamily = multiCorr*len(AssignmentReverseDictionary[AssignmentID])
        countsInMembers = np.sum([AssignmentID in item[1] for item in membersWithAssociations.values()])
        whereGeneHits = [AssignmentID in item[1] for item in membersWithAssociations.values()]
        listOfGenesHit = [[item, membersWithAssociations[item][0]] for item in np.array(list(membersWithAssociations.keys()))[whereGeneHits]]
        testValue = TestFunction(countsInList, countsInFamily, multiCorr*totalMembers, countsInMembers)

        testCats[AssignmentID] = [testValue, [countsInList, countsInFamily, multiCorr*totalMembers, countsInMembers], infoDict[AssignmentID], listOfGenesHit]

    correctedpValues = dict(zip(allAssignmentIDs, HypothesisFunction([item[0] for item in list(testCats.values())], pValueCutoff).T))

    for AssignmentID in allAssignmentIDs:
        testCats[AssignmentID][0] = [testCats[AssignmentID][0], correctedpValues[AssignmentID][1], correctedpValues[AssignmentID][2]]

    ResultsHCct = testCats

    #Length filter
    whatIsFilteredLength = ReportFilterFunction(np.array([item[1][3] for item in list(ResultsHCct.values())]), ReportFilter)

    #Significance filter
    whatIsFilteredSignif = np.array([(item[0][2] if FilterSignificant else True) for item in list(ResultsHCct.values())]).astype(bool)

    #Combined filter
    whatIsFiltered = whatIsFilteredLength * whatIsFilteredSignif

    returning = dict(zip(list(np.array(list(ResultsHCct.keys()), dtype=object)[whatIsFiltered]),list(np.array(list(ResultsHCct.values()), dtype=object)[whatIsFiltered])))

    return {list(data.keys())[0]: returning}

[docs]def OBOGODictionary(FileURL="http://purl.obolibrary.org/obo/go/go-basic.obo", ImportDirectly=False, PyIOmicaDataDirectory=None, OBOFile="goBasicObo.txt"):

    """Generate Open Biomedical Ontologies (OBO) Gene Ontology (GO) vocabulary dictionary.
    
    Parameters: 
        FileURL: str, Default "http://purl.obolibrary.org/obo/go/go-basic.obo"
            Provides the location of the Open Biomedical Ontologies (OBO) Gene Ontology (GO) 
            file in case this will be downloaded from the web

        ImportDirectly: boolean, Default False
            Import from URL regardles is the file already exists

        PyIOmicaDataDirectory: str, Default None
            Path of directories to data storage

        OBOFile: str, Default "goBasicObo.txt"
            Name of file to store data in (file will be zipped)

    Returns:
        dictionary
            Dictionary of definitions

    Usage:
        OBODict = OBOGODictionary()
    """

    global ConstantPyIOmicaDataDirectory

    PyIOmicaDataDirectory = ConstantPyIOmicaDataDirectory if PyIOmicaDataDirectory==None else PyIOmicaDataDirectory
    fileGOOBO = os.path.join(PyIOmicaDataDirectory, OBOFile)
    fileGOOBOgz = fileGOOBO + '.gz'

    #import the GO OBO file: we check if the OBO file Exist, if not, attempt to download and create it
    if not os.path.isfile(fileGOOBOgz):
        print("Did Not Find Annotation Files, Attempting to Download...")
        ImportDirectly = True

    if os.path.isfile(fileGOOBO):
        os.remove(fileGOOBO)

    if ImportDirectly:
        if os.path.isfile(fileGOOBOgz):
            os.remove(fileGOOBOgz)

        urllib.request.urlretrieve(FileURL.strip('"'), fileGOOBO)

        if os.path.isfile(fileGOOBO):
            print("Created Annotation Files at ", fileGOOBO)
        else:
            print("Did Not Find Annotation Files, Aborting Process")
            return

        with open(fileGOOBO, 'rb') as fileIn:
            with gzip.open(fileGOOBOgz, 'wb') as fileOut:
                shutil.copyfileobj(fileIn, fileOut)

        print("Compressed local file with GZIP.")

        os.remove(fileGOOBO)

    with gzip.open(fileGOOBOgz, 'r') as tempFile:
        inputFile = tempFile.readlines()

    inputFile = [item.decode() for item in inputFile]

    #Find keys "accessions (id):" and "name:" and "namespace" but extract their corresponding values in a list and map them to their corresponding [Term] positions, 
    #Once the "accessions (id):" and its corresponding "name:" in a list, make an association between them,
    #so you can search this association using the key "accessions (id):" to get the value "name:" and "namespace"
    outDictionary = {}

    for position in np.where([item=='[Term]\n'for item in inputFile])[0]:
        
        def getValue(index):

            return inputFile[position + index].strip(['id:', 'name:', 'namespace:'][index - 1]).strip('\n').strip()

        outDictionary[getValue(1)] = [getValue(2), getValue(3)]
    
    return outDictionary

[docs]def GetGeneDictionary(geneUCSCTable = None, UCSCSQLString = None, UCSCSQLSelectLabels = None,
                    ImportDirectly = False, Species = "human", KEGGUCSCSplit = [True,"KEGG Gene ID"]):
    
    """Create an ID/accession dictionary from a UCSC search - typically of gene annotations.
    
    Parameters: 
        geneUCSCTable: str, Default None
            Path to a geneUCSCTable file

        UCSCSQLString: str, Default None
            An association to be used to obtain data from the UCSC Browser tables. The key of the association must 
            match the Species option value used (default: human). The value for the species corresponds to the actual MySQL command used

        UCSCSQLSelectLabels: str, Default None
            An association to be used to assign key labels for the data imported from the UCSC Browser tables. 
            The key of the association must match the Species option value used (default: human). The value is a multi component string 
            list corresponding to the matrices in the data file, or the tables used in the MySQL query provided by UCSCSQLString

        ImportDirectly: boolean, Default False
            Import from URL regardles is the file already exists

        Species: str, Default "human"
            Species considered in the calculation, by default corresponding to human

        KEGGUCSCSplit: list, Default [True,"KEGG Gene ID"]
            Two component list, {True/False, label}. If the first component is set to True the initially imported KEGG IDs, 
            identified by the second component label,  are split on + string to fix nomenclature issues, retaining the string following +

    Returns:
        dictionary
            Gene dictionary

    Usage:
        geneDict = GetGeneDictionary()
    """
       
    UCSCSQLSelectLabels = {"human": ["UCSC ID", "UniProt ID", "Gene Symbol", 
        "RefSeq ID", "NCBI Protein Accession", "Ensembl ID", 
        "KEGG Gene ID", "HGU133Plus2 Affymetrix ID"]}

    #Update these to match any change in query
    #Query for UCSC SQL server
    UCSCSQLString = {"human": 
       "SELECT hg19.kgXref.kgID, hg19.kgXref.spID, \
        hg19.kgXref.geneSymbol, hg19.kgXref.refseq, hg19.kgXref.protAcc, \
        hg19.knownToEnsembl.value, hg19.knownToKeggEntrez.keggEntrez, \
        hg19.knownToU133Plus2.value FROM hg19.kgXref LEFT JOIN \
        hg19.knownToEnsembl ON hg19.kgXref.kgID = hg19.knownToEnsembl.name \
        LEFT JOIN hg19.knownToKeggEntrez ON hg19.kgXref.kgID = \
        hg19.knownToKeggEntrez.name LEFT JOIN hg19.knownToU133Plus2 ON \
        hg19.kgXref.kgID = hg19.knownToU133Plus2.name"}

    global ConstantPyIOmicaDataDirectory

    PyIOmicaDataDirectory = ConstantPyIOmicaDataDirectory

    if geneUCSCTable is None:
        geneUCSCTable = os.path.join(PyIOmicaDataDirectory, Species + "GeneUCSCTable" + ".json.gz")
       
    #If the user asked us to import directly, import directly with SQL, otherwise, get it from a directory they specify
    if not os.path.isfile(geneUCSCTable):
        print("Did Not Find Gene Translation Files, Attempting to Download from UCSC...")
        ImportDirectly = True
    else:
        termTable = dataStorage.read(geneUCSCTable, jsonFormat=True)[1]
        termTable = np.array(termTable)

    if ImportDirectly:
        #Connect to the database from UCSC
        ucscDatabase = pymysql.connect("genome-mysql.cse.ucsc.edu","genomep","password")

        if ucscDatabase==None:
            print("Could not establish connection to UCSC. Please try again or add the dictionary manually at ", geneUCSCTable)
            return

        #Prepare a cursor object using cursor() method
        ucscDatabaseCursor = ucscDatabase.cursor()

        try:
            #Execute the SQL command
            ucscDatabaseCursor.execute(UCSCSQLString[Species])

            #Fetch all the rows in a list of lists.
            termTable = ucscDatabaseCursor.fetchall()

        except Exception as exception:
            print(exception)
            print ("Error: unable to fetch data")

        termTable = np.array(termTable).T
        termTable[np.where(termTable=="")] = None

        #Get all the terms we are going to need, import with SQL the combined tables,and export with a time stamp
        dataStorage.write((datetime.datetime.now().isoformat(), termTable.tolist()), geneUCSCTable, jsonFormat=True)

        #Close SQL connection
        ucscDatabase.close()

        if os.path.isfile(geneUCSCTable):
            print("Created Annotation Files at ", geneUCSCTable)
        else:
            print("Did Not Find Annotation Files, Aborting Process")
            return

    returning = {Species : dict(zip(UCSCSQLSelectLabels[Species],termTable))}
    
    if KEGGUCSCSplit[0]:
        returning[Species][KEGGUCSCSplit[1]] = np.array([item.split("+")[1] if item!=None else item for item in returning[Species][KEGGUCSCSplit[1]]])
    
    return returning

[docs]def GOAnalysisAssigner(PyIOmicaDataDirectory = None, ImportDirectly = False, BackgroundSet = [], Species = "human",
                        LengthFilter = None, LengthFilterFunction = np.greater_equal, GOFileName = None, GOFileColumns = [2, 5], 
                        GOURL = "http://current.geneontology.org/annotations/"):
    
    """Download and create gene associations and restrict to required background set.

    Parameters: 
        PyIOmicaDataDirectory: str, Default None
            The directory where the default package data is stored

        ImportDirectly: boolean, Default False
            Import from URL regardles is the file already exists

        BackgroundSet: list, Default []
            Background list to create annotation projection to limited background space, involves
            considering pathways/groups/sets and that provides a list of IDs (e.g. gene accessions) that should 
            be considered as the background for the calculation

        Species: str, Default "human"
            Species considered in the calculation, by default corresponding to human

        LengthFilterFunction: function, Default np.greater_equal
            Performs computations of membership in pathways/ontologies/groups/sets, 
            that specifies which function to use to filter the number of members a reported category has 
            compared to the number typically provided by LengthFilter 

        LengthFilter: int, Default None
            Argument for LengthFilterFunction

        GOFileName: str, Default None
            The name for the specific GO file to download from the GOURL if option ImportDirectly is set to True

        GOFileColumns: list, Default [2, 5]
            Columns to use for IDs and GO:accessions respectively from the downloaded GO annotation file, 
            used when ImportDirectly is set to True to obtain a new GO association file

        GOURL: str, Default "http://current.geneontology.org/annotations/"
            The location (base URL) where the GO association annotation files are downloaded from

    Returns:
        dictionary
            Dictionary of IDToGO and GOToID dictionaries

    Usage:
        GOassignment = GOAnalysisAssigner()
    """

    global ConstantPyIOmicaDataDirectory

    PyIOmicaDataDirectory = ConstantPyIOmicaDataDirectory if PyIOmicaDataDirectory==None else PyIOmicaDataDirectory

    #If the user asked us to import PyIOmicaDataDirectoryectly, import PyIOmicaDataDirectoryectly from GO website, otherwise, get it from a PyIOmicaDataDirectoryectory they specify
    file = "goa_" + Species + ".gaf.gz" if GOFileName==None else GOFileName
    localFile =  os.path.join(PyIOmicaDataDirectory, "goa_" + Species + ".gaf")
    localZipFile =  os.path.join(PyIOmicaDataDirectory, "goa_" + Species + ".gaf.gz")
    fileGOAssociations = [os.path.join(PyIOmicaDataDirectory, Species + item + ".json.gz") for item in ["GeneOntAssoc", "IdentifierAssoc"]]

    #We check if the Annotations exist, if not, attempt to download and create them
    if not np.array(list(map(os.path.isfile, fileGOAssociations))).all():
        print("Did Not Find Annotation Files, Attempting to Download...")
        ImportDirectly = True

    if ImportDirectly:
        #Delete existing file
        if os.path.isfile(localFile):
            os.remove(localFile)
        
        urllib.request.urlretrieve(GOURL + file, "\\".join([localZipFile]))
        
        with gzip.open(localZipFile, 'rb') as fileIn:
            with open(localFile, 'wb') as fileOut:
                shutil.copyfileobj(fileIn, fileOut)
        
        #Clean up archive
        os.remove(localZipFile)
        
        with open(localFile, 'r') as tempFile:
            goData = tempFile.readlines()

        #Remove comments by "!" lines
        goData = np.array(goData[np.where(np.array([line[0]!='!' for line in goData]))[0][0]:])
        goData = pd.DataFrame([item.strip('\n').split('\t') for item in goData]).values

        #Remove all entries with missing
        df = pd.DataFrame(goData.T[np.array(GOFileColumns)-1].T)
        df = df[np.count_nonzero(df.values!='', axis=1)==2]

        ##Mark missing values with NaN
        #df.iloc[np.where(df.values=='')] = np.nan

        dfGrouped = df.groupby(df.columns[1])

        keys, values = list(dfGrouped.indices.keys()), list(dfGrouped.indices.values())
        IDs = df.values.T[0]
        geneOntAssoc = dict(zip(keys, [np.unique(IDs[value]).tolist() for value in values]))

        identifierAssoc = utilityFunctions.createReverseDictionary(geneOntAssoc)

        #Save created annotations geneOntAssoc, identifierAssoc
        dataStorage.write((datetime.datetime.now().isoformat(), geneOntAssoc), fileGOAssociations[0], jsonFormat=True)
        dataStorage.write((datetime.datetime.now().isoformat(), identifierAssoc), fileGOAssociations[1], jsonFormat=True)
        
        os.remove(localFile)

        if np.array(list(map(os.path.isfile, fileGOAssociations))).all():
            print("Created Annotation Files at ", fileGOAssociations)
        else:
            print("Did Not Find Annotation Files, Aborting Process")
            return
    else:
        #Otherwise we get from the user specified PyIOmicaDataDirectoryectory
        geneOntAssoc = dataStorage.read(fileGOAssociations[0], jsonFormat=True)[-1]
        identifierAssoc = dataStorage.read(fileGOAssociations[1], jsonFormat=True)[-1]

    if BackgroundSet!=[]:
        #Using provided background list to create annotation projection to limited background space, also remove entries with only one and missing value
        keys, values = np.array(list(identifierAssoc.keys())), np.array(list(identifierAssoc.values()))
        index = np.where([(((len(values[i])==True)*(values[i][0]!=values[i][0]))==False)*(keys[i] in BackgroundSet) for i in range(len(keys))])[0]
        identifierAssoc = dict(zip(keys[index],values[index]))

        #Create corresponding geneOntAssoc
        geneOntAssoc = utilityFunctions.createReverseDictionary(identifierAssoc)

    if LengthFilter!=None:
        keys, values = np.array(list(geneOntAssoc.keys()), dtype=object), np.array(list(geneOntAssoc.values()), dtype=object)
        index = np.where(LengthFilterFunction(np.array([len(value) for value in values]), LengthFilter))[0]
        geneOntAssoc = dict(zip(keys[index],values[index]))

        #Create corresponding identifierAssoc
        identifierAssoc = utilityFunctions.createReverseDictionary(geneOntAssoc)

    return {Species : {"IDToGO": identifierAssoc, "GOToID": geneOntAssoc}}

[docs]def obtainConstantGeneDictionary(GeneDictionary, GetGeneDictionaryOptions, AugmentDictionary):
    
    """Obtain gene dictionary - if it exists can either augment with new information or Species or create new, 
    if not exist then create variable.

    Parameters:
        GeneDictionary: dictionary or None
            An existing variable to use as a gene dictionary in annotations. 
            If set to None the default ConstantGeneDictionary will be used

        GetGeneDictionaryOptions: dictionary
            A list of options that will be passed to this internal GetGeneDictionary function

        AugmentDictionary: boolean
            A choice whether or not to augment the current ConstantGeneDictionary global variable or create a new one

    Returns:
        None

    Usage:
        obtainConstantGeneDictionary(None, {}, False)
    """

    global ConstantGeneDictionary
    
    if ConstantGeneDictionary!=None:
        #ConstantGeneDictionary exists
        if AugmentDictionary:
            #Augment ConstantGeneDictionary
            ConstantGeneDictionary = {**ConstantGeneDictionary, **(GetGeneDictionary(**GetGeneDictionaryOptions) if GeneDictionary==None else GeneDictionary)}
        else:
            #Replace ConstantGeneDictionary
            ConstantGeneDictionary = GetGeneDictionary(**GetGeneDictionaryOptions) if GeneDictionary==None else GeneDictionary
    else:
        #Create/load UCSC based translation dictionary - NB global variable or use specified variable
        ConstantGeneDictionary = GetGeneDictionary(**GetGeneDictionaryOptions) if GeneDictionary==None else GeneDictionary

    return None

[docs]def GOAnalysis(data, GetGeneDictionaryOptions={}, AugmentDictionary=True, InputID=["UniProt ID","Gene Symbol"], OutputID="UniProt ID",
                 GOAnalysisAssignerOptions={}, BackgroundSet=[], Species="human", OntologyLengthFilter=2, ReportFilter=1, ReportFilterFunction=np.greater_equal,
                 pValueCutoff=0.05, TestFunction=lambda n, N, M, x: 1. - scipy.stats.hypergeom.cdf(x-1, M, n, N), 
                 HypothesisFunction=lambda data, SignificanceLevel: BenjaminiHochbergFDR(data, SignificanceLevel=SignificanceLevel)["Results"], 
                 FilterSignificant=True, OBODictionaryVariable=None,
                 OBOGODictionaryOptions={}, MultipleListCorrection=None, MultipleList=False, GeneDictionary=None):

    """Calculate input data over-representation analysis for Gene Ontology (GO) categories.

    Parameters:
        data: pd.DataFrame or list
            Data to analyze

        GetGeneDictionaryOptions: dictionary, Default {}
            A list of options that will be passed to this internal GetGeneDictionary function

        AugmentDictionary: boolean, Default True
            A choice whether or not to augment the current ConstantGeneDictionary global variable or create a new one

        InputID: list, Default ["UniProt ID","Gene Symbol"]
            Kind of identifiers/accessions used as input

        OutputID: str, Default "UniProt ID"
            Kind of IDs/accessions to convert the input IDs/accession numbers in the function's analysis

        GOAnalysisAssignerOptions: dictionary, Default {}
            A list of options that will be passed to the internal GOAnalysisAssigner function

        BackgroundSet: list, Default []
            Background list to create annotation projection to limited background space, involves
            considering pathways/groups/sets and that provides a list of IDs (e.g. gene accessions) that should be 
            considered as the background for the calculation

        Species: str, Default "human"
            The species considered in the calculation, by default corresponding to human

        OntologyLengthFilter: int, Default 2
            Function that can be used to set the value for which terms to consider in the computation, 
            by excluding GO terms that have fewer items compared to the OntologyLengthFilter value. It is used by the internal
            GOAnalysisAssigner function

        ReportFilter: int, Default 1
            Functions that use pathways/ontologies/groups, and provides a cutoff for membership in ontologies/pathways/groups
            in selecting which terms/categories to return. It is typically used in conjunction with ReportFilterFunction

        ReportFilterFunction: function , Default np.greater_equal
            Specifies what operator form will be used to compare against ReportFilter option value in 
            selecting which terms/categories to return

        pValueCutoff: float, Default 0.05
            Significance cutoff

        TestFunction: function, Default lambda n, N, M, x: 1. - scipy.stats.hypergeom.cdf(x-1, M, n, N)
            Test function

        HypothesisFunction: function, Default lambda data, SignificanceLevel: BenjaminiHochbergFDR(data, SignificanceLevel=SignificanceLevel)["Results"]
            Allows the choice of function for implementing multiple hypothesis testing considerations

        FilterSignificant: boolean, Default True
            Can be set to True to filter data based on whether the analysis result is statistically significant, 
            or if set to False to return all membership computations

        OBODictionaryVariable: str, Default None
            A GO annotation variable. If set to None, OBOGODictionary will be used internally to 
            automatically generate the default GO annotation

        OBOGODictionaryOptions: dictionary, Default {}
            A list of options to be passed to the internal OBOGODictionary function that provides the GO annotations

        MultipleListCorrection: boolean, Default None
            Specifies whether or not to correct for multi-omics analysis. The choices are None, Automatic, 
            or a custom number, e.g protein+RNA

        MultipleList: boolean, Default False
            Specifies whether the input accessions list constituted a multi-omics list input that is annotated so

        GeneDictionary: str, Default None
            Points to an existing variable to use as a gene dictionary in annotations. If set to None 
            the default ConstantGeneDictionary will be used

    Returns:
        dictionary
            Enrichment dictionary

    Usage:
        goExample1 = GOAnalysis(["TAB1", "TNFSF13B", "MALT1", "TIRAP", "CHUK", 
                                "TNFRSF13C", "PARP1", "CSNK2A1", "CSNK2A2", "CSNK2B", "LTBR", 
                                "LYN", "MYD88", "GADD45B", "ATM", "NFKB1", "NFKB2", "NFKBIA", 
                                "IRAK4", "PIAS4", "PLAU"])
    """

    global ConstantGeneDictionary

    #Obtain OBO dictionary with OBOGODictionaryOptions if any. If externally defined use user definition for OBODict Variable
    OBODict = OBOGODictionary(**OBOGODictionaryOptions) if OBODictionaryVariable==None else OBODictionaryVariable

    #Obtain gene dictionary - if it exists can either augment with new information or Species or create new, if not exist then create variable
    obtainConstantGeneDictionary(GeneDictionary, GetGeneDictionaryOptions, AugmentDictionary)
    
    #Get the right GO terms for the BackgroundSet requested and correct Species
    Assignment = GOAnalysisAssigner(BackgroundSet=BackgroundSet, Species=Species , LengthFilter=OntologyLengthFilter) if GOAnalysisAssignerOptions=={} else GOAnalysisAssigner(**GOAnalysisAssignerOptions)
    
    listToggle = False

    #If the input is simply a list
    if type(data) is list:
        data = {'dummy': data}
        listToggle = True

    #The data may be a subgroup from a clustering object, i.e. a pd.DataFrame
    if type(data) is pd.DataFrame:
        id = list(data.index.get_level_values('id'))
        source = list(data.index.get_level_values('source'))
        data = [[id[i], source[i]] for i in range(len(data))]
        data = {'dummy': data}
        listToggle = True

    returning = {}

    #Check if a clustering object
    if "linkage" in data.keys():
        if MultipleListCorrection==None:
            multiCorr = 1
        elif MultipleListCorrection=='Automatic':
            multiCorr = 1
            for keyGroup in sorted([item for item in list(data.keys()) if not item=='linkage']):
                for keySubGroup in sorted([item for item in list(data[keyGroup].keys()) if not item=='linkage']):
                    multiCorr = max(max(np.unique(data[keyGroup][keySubGroup]['data'].index.get_level_values('id'), return_counts=True)[1]), multiCorr)
        else:
            multiCorr = MultipleListCorrection

        #Loop through the clustering object, calculate GO for each SubGroup
        for keyGroup in sorted([item for item in list(data.keys()) if not item=='linkage']):
            returning[keyGroup] = {}
            for keySubGroup in sorted([item for item in list(data[keyGroup].keys()) if not item=='linkage']):
                SubGroupMultiIndex = data[keyGroup][keySubGroup]['data'].index
                SubGroupGenes = list(SubGroupMultiIndex.get_level_values('id'))
                SubGroupMeta = list(SubGroupMultiIndex.get_level_values('source'))
                SubGroupList = [[SubGroupGenes[i], SubGroupMeta[i]] for i in range(len(SubGroupMultiIndex))]

                returning[keyGroup][keySubGroup] = internalAnalysisFunction({keySubGroup:SubGroupList},
                                                                     multiCorr, MultipleList,  OutputID, InputID, Species, len(Assignment[Species]["IDToGO"]),
                                                                     pValueCutoff, ReportFilterFunction, ReportFilter, TestFunction, HypothesisFunction, FilterSignificant,
                                                                     AssignmentForwardDictionary=Assignment[Species]['IDToGO'],
                                                                     AssignmentReverseDictionary=Assignment[Species]['GOToID'],
                                                                     prefix='', infoDict=OBODict)[keySubGroup]

    #The data is a dictionary of type {'Name1': [data1], 'Name2': [data2], ...}
    else:
        for key in list(data.keys()):
            if MultipleListCorrection==None:
                multiCorr = 1
            elif MultipleList and MultipleListCorrection=='Automatic':
                multiCorr = max(np.unique([item[0] for item in data[key]], return_counts=True)[1])
            else:
                multiCorr = MultipleListCorrection

            returning.update(internalAnalysisFunction({key:data[key]}, multiCorr, MultipleList,  OutputID, InputID, Species, len(Assignment[Species]["IDToGO"]),
                                                pValueCutoff, ReportFilterFunction, ReportFilter, TestFunction, HypothesisFunction, FilterSignificant,
                                                AssignmentForwardDictionary=Assignment[Species]['IDToGO'],
                                                AssignmentReverseDictionary=Assignment[Species]['GOToID'],
                                                prefix='', infoDict=OBODict))

        #If a single list was provided, return the association for Gene Ontologies
        returning = returning['dummy'] if listToggle else returning

    return returning

[docs]def GeneTranslation(InputList, TargetIDList, GeneDictionary, InputID = None, Species = "human"):

    """Use geneDictionary to convert inputList IDs to different annotations as indicated by targetIDList.
    
    Parameters:
        InputList: list
            List of names

        TargetIDList: list
            Target ID list

        GeneDictionary: dictionary
            An existing variable to use as a gene dictionary in annotations.
            If set to None the default ConstantGeneDictionary will be used

        InputID: str, Default None
            The kind of identifiers/accessions used as input

        Species: str, Default "human"
            The species considered in the calculation, by default corresponding to human
    
    Returns:
        dictionary
            Gene dictionary

    Usage:
        GenDict = GeneTranslation(data, "UniProt ID", ConstantGeneDictionary, InputID = ["UniProt ID","Gene Symbol"],  Species = "human")
    
    """

    if InputID!=None:
        listOfKeysToUse = []
        if type(InputID) is list:
            for key in InputID:
                if key in list(GeneDictionary[Species].keys()):
                    listOfKeysToUse.append(key)
        elif type(InputID) is str:
            listOfKeysToUse.append(InputID)
    else:
        listOfKeysToUse = list(GeneDictionary[Species].keys())
    
    returning = {}

    for TargetID in ([TargetIDList] if type(TargetIDList) is str else TargetIDList):
        returning[TargetID] = {}
        for key in listOfKeysToUse:
            returning[TargetID][key] = []
            for item in InputList:
                allEntries = np.array(GeneDictionary[Species][TargetID])[np.where(np.array(GeneDictionary[Species][key])==item)[0]]
                returning[TargetID][key].append(list(np.unique(allEntries[np.where(allEntries!=None)[0]]) if InputID!=None else allEntries))
                
        #Merge all found lists into one list
        if InputID!=None:
            returningCopy = returning.copy()
            returning[TargetID] = []
            for iitem, item in enumerate(InputList):
                tempList = []
                for key in listOfKeysToUse:
                    tempList.extend(returningCopy[TargetID][key][iitem])
                returning[TargetID].append(tempList)
                    
    return returning

[docs]def KEGGAnalysisAssigner(PyIOmicaDataDirectory = None, ImportDirectly = False, BackgroundSet = [], KEGGQuery1 = "pathway", KEGGQuery2 = "hsa",
                        LengthFilter = None, LengthFilterFunction = np.greater_equal, Labels = ["IDToPath", "PathToID"]):

    """Create KEGG: Kyoto Encyclopedia of Genes and Genomes pathway associations, 
    restricted to required background set, downloading the data if necessary.

    Parameters: 
        PyIOmicaDataDirectory: str, Default None
            Directory where the default package data is stored

        ImportDirectly: boolean, Default False
            Import from URL regardles is the file already exists

        BackgroundSet: list, Default []
            A list of IDs (e.g. gene accessions) that should be considered as the background for the calculation

        KEGGQuery1: str, Default "pathway"
            Make KEGG API calls, and sets string query1 in http://rest.kegg.jp/link/<> query1 <> / <> query2. 
            Typically this will be used as the target database to find related entries by using database cross-references

        KEGGQuery2: str, Default "hsa"
            KEGG API calls, and sets string query2 in http://rest.kegg.jp/link/<> query1 <> / <> query2. 
            Typically this will be used as the source database to find related entries by using database cross-references

        LengthFilterFunction: function, Default np.greater_equal
            Option for functions that perform computations of membership in 
            pathways/ontologies/groups/sets, that specifies which function to use to filter the number of members a reported 
            category has compared to the number typically provided by LengthFilter

        LengthFilter: int, Default None
            Allows the selection of how many members each category can have, as typically 
            restricted by the LengthFilterFunction

        Labels: list, Default ["IDToPath", "PathToID"]
            A string list for how keys in a created association will be named

    Returns:
        dictionary
            IDToPath and PathToID dictionary

    Usage:
        KEGGassignment = KEGGAnalysisAssigner()
    """
    
    global ConstantPyIOmicaDataDirectory
    
    PyIOmicaDataDirectory = ConstantPyIOmicaDataDirectory if PyIOmicaDataDirectory==None else PyIOmicaDataDirectory

    ##if the user asked us to import directly, import directly from KEGG website, otherwise, get it from a directory they specify
    fileAssociations = [os.path.join(PyIOmicaDataDirectory, item) for item in [KEGGQuery1 + "_" + KEGGQuery2 + "KEGGMemberToPathAssociation.json.gz", 
                                                                                KEGGQuery1 + "_" + KEGGQuery2 + "KEGGPathToMemberAssociation.json.gz"]]

    if not np.array(list(map(os.path.isfile, fileAssociations))).all():
        print("Did Not Find Annotation Files, Attempting to Download...")
        ImportDirectly = True

    if ImportDirectly:
        localFile = os.path.join(PyIOmicaDataDirectory, KEGGQuery1 + "_" + KEGGQuery2 + ".tsv")
        #Delete existing file
        if os.path.isfile(localFile):
            os.remove(localFile)

        urllib.request.urlretrieve("http://rest.kegg.jp/link/" + KEGGQuery1 + ("" if KEGGQuery2=="" else "/" + KEGGQuery2), localFile)

        with open(localFile, 'r') as tempFile:
            tempLines = tempFile.readlines()
            
        df = pd.DataFrame([line.strip('\n').split('\t') for line in tempLines])

        ##Remove all entries with missing
        #df = pd.DataFrame(goData.T[np.array(GOFileColumns)-1].T)
        #df = df[np.count_nonzero(df.values!='', axis=1)==2]

        dfGrouped = df.groupby(df.columns[1])
        keys, values = list(dfGrouped.indices.keys()), list(dfGrouped.indices.values())
        IDs = df.values.T[0]
        pathToID = dict(zip(keys, [np.unique(IDs[value]).tolist() for value in values]))
        idToPath = utilityFunctions.createReverseDictionary(pathToID)

        dataStorage.write((datetime.datetime.now().isoformat(), idToPath), fileAssociations[0], jsonFormat=True)
        dataStorage.write((datetime.datetime.now().isoformat(), pathToID), fileAssociations[1], jsonFormat=True)

        os.remove(localFile)

        if np.array(list(map(os.path.isfile, fileAssociations))).all():
            print("Created Annotation Files at ", fileAssociations)
        else:
            print("Did Not Find Annotation Files, Aborting Process")
            return
    else:
        #otherwise import the necessary associations from PyIOmicaDataDirectoryectory
        idToPath = dataStorage.read(fileAssociations[0], jsonFormat=True)[1]
        pathToID = dataStorage.read(fileAssociations[1], jsonFormat=True)[1]

    if BackgroundSet!=[]:
        #Using provided background list to create annotation projection to limited background space
        keys, values = np.array(list(idToPath.keys())), np.array(list(idToPath.values()))
        index = np.where([(((len(values[i])==True)*(values[i][0]!=values[i][0]))==False)*(keys[i] in BackgroundSet) for i in range(len(keys))])[0]
        idToPath = dict(zip(keys[index],values[index]))

        #Create corresponding reverse dictionary
        pathToID = utilityFunctions.createReverseDictionary(idToPath)

    if LengthFilter!=None:
        keys, values = np.array(list(pathToID.keys())), np.array(list(pathToID.values()))
        index = np.where(LengthFilterFunction(np.array([len(value) for value in values]), LengthFilter))[0]
        pathToID = dict(zip(keys[index],values[index]))

        #Create corresponding reverse dictionary
        idToPath = utilityFunctions.createReverseDictionary(pathToID)

    return {KEGGQuery2 : {Labels[0]: idToPath, Labels[1]: pathToID}}

[docs]def KEGGDictionary(PyIOmicaDataDirectory = None, ImportDirectly = False, KEGGQuery1 = "pathway", KEGGQuery2 = "hsa"):

    """Create a dictionary from KEGG: Kyoto Encyclopedia of Genes and Genomes terms - 
    typically association of pathways and members therein.
    
    Parameters: 
        PyIOmicaDataDirectory: str, Default None
            directory where the default package data is stored

        ImportDirectly: boolean, Default False
            import from URL regardles is the file already exists

        KEGGQuery1: str, Default "pathway"
            make KEGG API calls, and sets string query1 in http://rest.kegg.jp/link/<> query1 <> / <> query2. 
            Typically this will be used as the target database to find related entries by using database cross-references

        KEGGQuery2: str, Default "hsa"
            KEGG API calls, and sets string query2 in http://rest.kegg.jp/link/<> query1 <> / <> query2. 
            Typically this will be used as the source database to find related entries by using database cross-references

    Returns:
        dictionary
            Dictionary of definitions

    Usage:
        KEGGDict = KEGGDictionary()
    """
    
    global ConstantPyIOmicaDataDirectory
    
    PyIOmicaDataDirectory = ConstantPyIOmicaDataDirectory if PyIOmicaDataDirectory==None else PyIOmicaDataDirectory

    #if the user asked us to import directly, import directly from KEGG website, otherwise, get it from a directory they specify
    fileKEGGDict = os.path.join(PyIOmicaDataDirectory, KEGGQuery1 + "_" + KEGGQuery2 + "_KEGGDictionary.json.gz")

    if os.path.isfile(fileKEGGDict):
        associationKEGG = dataStorage.read(fileKEGGDict, jsonFormat=True)[1]
    else:
        print("Did Not Find Annotation Files, Attempting to Download...")
        ImportDirectly = True

    if ImportDirectly:
        queryFile = os.path.join(PyIOmicaDataDirectory, KEGGQuery1 + "_" + KEGGQuery2 + ".tsv")

        if os.path.isfile(queryFile): 
            os.remove(queryFile)

        urllib.request.urlretrieve("http://rest.kegg.jp/list/" + KEGGQuery1 + ("" if KEGGQuery2=="" else "/" + KEGGQuery2), queryFile)

        with open(queryFile, 'r') as tempFile:
            tempLines = tempFile.readlines()
            
        os.remove(queryFile)
        
        associationKEGG = dict([line.strip('\n').split('\t') for line in tempLines])

        dataStorage.write((datetime.datetime.now().isoformat(), associationKEGG), fileKEGGDict, jsonFormat=True)

        if os.path.isfile(fileKEGGDict):
            print("Created Annotation Files at ", fileKEGGDict)
        else:
            print("Did Not Find Annotation Files, Aborting Process")
            return

    return associationKEGG

[docs]def KEGGAnalysis(data, AnalysisType = "Genomic", GetGeneDictionaryOptions = {}, AugmentDictionary = True, InputID = ["UniProt ID", "Gene Symbol"],
                OutputID = "KEGG Gene ID", MolecularInputID = ["cpd"], MolecularOutputID = "cpd", KEGGAnalysisAssignerOptions = {}, BackgroundSet = [], 
                KEGGOrganism = "hsa", KEGGMolecular = "cpd", KEGGDatabase = "pathway", PathwayLengthFilter = 2, ReportFilter = 1, 
                ReportFilterFunction = np.greater_equal, pValueCutoff = 0.05, TestFunction = lambda n, N, M, x: 1. - scipy.stats.hypergeom.cdf(x-1, M, n, N), 
                HypothesisFunction = lambda data, SignificanceLevel: BenjaminiHochbergFDR(data, SignificanceLevel=SignificanceLevel)["Results"],
                FilterSignificant = True, KEGGDictionaryVariable = None, KEGGDictionaryOptions = {}, MultipleListCorrection = None, MultipleList = False, 
                GeneDictionary = None, Species = "human", MolecularSpecies = "compound", NonUCSC = False, PyIOmicaDataDirectory = None):

    """Calculate input data over-representation analysis for KEGG: Kyoto Encyclopedia of Genes and Genomes pathways.
    Input can be a list, a dictionary of lists or a clustering object.

    Parameters:
        data: pandas.DetaFrame or list
            Data to analyze

        AnalysisType: str, Default "Genomic"
            Analysis methods that may be used, "Genomic", "Molecular" or "All"

        GetGeneDictionaryOptions: dictionary, Default {}
            A list of options that will be passed to this internal GetGeneDictionary function

        AugmentDictionary: boolean, Default True
            A choice whether or not to augment the current ConstantGeneDictionary global variable or create a new one

        InputID: list, Default ["UniProt ID", "Gene Symbol"]
            The kind of identifiers/accessions used as input

        OutputID: str, Default "KEGG Gene ID"
            A string value that specifies what kind of IDs/accessions to convert the input IDs/accession 
            numbers in the function's analysis

        MolecularInputID: list, Default ["cpd"]
            A string list to indicate the kind of ID to use for the input molecule entries

        MolecularOutputID: str, Default "cpd"
            A string list to indicate the kind of ID to use for the input molecule entries

        KEGGAnalysisAssignerOptions: dictionary, Default {}
            A list of options that will be passed to this internal KEGGAnalysisAssigner function

        BackgroundSet: list, Default []
            A list of IDs (e.g. gene accessions) that should be considered as the background for the calculation

        KEGGOrganism: str, Default "hsa"
            Indicates which organism (org) to use for \"Genomic\" type of analysis (default is human analysis: org=\"hsa\")

        KEGGMolecular: str, Default "cpd"
            Which database to use for molecular analysis (default is the compound database: cpd)

        KEGGDatabase: str, Default "pathway"
            KEGG database to use as the target database

        PathwayLengthFilter: int, Default 2
            Pathways to consider in the computation, by excluding pathways that have fewer items 
            compared to the PathwayLengthFilter value

        ReportFilter: int, Default 1
            Provides a cutoff for membership in ontologies/pathways/groups in selecting which terms/categories 
            to return. It is typically used in conjunction with ReportFilterFunction

        ReportFilterFunction: function, Default np.greater_equal
            Operator form will be used to compare against ReportFilter option value in selecting 
            which terms/categories to return

        pValueCutoff: float, Default 0.05
            A cutoff p-value for (adjusted) p-values to assess statistical significance

        TestFunction: function, Default lambda n, N, M, x: 1. - scipy.stats.hypergeom.cdf(x-1, M, n, N)
            A function used to calculate p-values

        HypothesisFunction: function, Default lambda data, SignificanceLevel: BenjaminiHochbergFDR(data, SignificanceLevel=SignificanceLevel)["Results"]
            Allows the choice of function for implementing multiple hypothesis testing considerations

        FilterSignificant: boolean, Default True
            Can be set to True to filter data based on whether the analysis result is statistically significant, 
            or if set to False to return all membership computations

        KEGGDictionaryVariable: str, Default None
            KEGG dictionary, and provides a KEGG annotation variable. If set to None, KEGGDictionary 
            will be used internally to automatically generate the default KEGG annotation

        KEGGDictionaryOptions: dictionary, Default {}
            A list of options to be passed to the internal KEGGDictionary function that provides the KEGG annotations

        MultipleListCorrection: boolean, Default None
            Specifies whether or not to correct for multi-omics analysis. 
            The choices are None, Automatic, or a custom number

        MultipleList: boolean, Default False 
            Whether the input accessions list constituted a multi-omics list input that is annotated so

        GeneDictionary: str, Default None
            Existing variable to use as a gene dictionary in annotations. If set to None the default ConstantGeneDictionary will be used

        Species: str, Default "human"
            The species considered in the calculation, by default corresponding to human

        MolecularSpecies: str, Default "compound"
            The kind of molecular input

        NonUCSC: , Default 
            If UCSC browser was used in determining an internal GeneDictionary used in ID translations,
            where the KEGG identifiers for genes are number strings (e.g. 4790).The NonUCSC option can be set to True 
            if standard KEGG accessions are used in a user provided GeneDictionary variable, 
            in the form OptionValue[KEGGOrganism] <>:<>numberString, e.g. hsa:4790

        PyIOmicaDataDirectory: str, Default None
            Directory where the default package data is stored

    Returns:
        dictionary
            Enrichment dictionary

    Usage:
        keggExample1 = KEGGAnalysis(["TAB1", "TNFSF13B", "MALT1", "TIRAP", "CHUK", "TNFRSF13C", "PARP1", "CSNK2A1", "CSNK2A2", "CSNK2B", "LTBR", "LYN", "MYD88", 
                                    "GADD45B", "ATM", "NFKB1", "NFKB2", "NFKBIA", "IRAK4", "PIAS4", "PLAU", "POLR3B", "NME1", "CTPS1", "POLR3A"])
    """

    argsLocal = locals().copy()

    global ConstantPyIOmicaDataDirectory

    obtainConstantGeneDictionary(None, {}, True)
    
    PyIOmicaDataDirectory = ConstantPyIOmicaDataDirectory if PyIOmicaDataDirectory==None else PyIOmicaDataDirectory

    #Gene Identifier based analysis
    if AnalysisType=="Genomic":
        #Obtain OBO dictionary. If externally defined use user definition for OBODict Var
        keggDict = KEGGDictionary(**KEGGDictionaryOptions) if KEGGDictionaryVariable==None else KEGGDictionaryVariable

        #Obtain gene dictionary - if it exists can either augment with new information or Species or create new, if not exist then create variable
        obtainConstantGeneDictionary(GeneDictionary, GetGeneDictionaryOptions, AugmentDictionary)

        #get the right KEGG terms for the BackgroundSet requested and correct Species
        Assignment = KEGGAnalysisAssigner(BackgroundSet=BackgroundSet, KEGGQuery1=KEGGDatabase, KEGGQuery2=KEGGOrganism, LengthFilter=PathwayLengthFilter) if KEGGAnalysisAssignerOptions=={} else KEGGAnalysisAssigner(**KEGGAnalysisAssignerOptions)

    #Molecular based analysis
    elif AnalysisType=="Molecular":
        InputID = MolecularInputID
        OutputID = MolecularOutputID
        Species = MolecularSpecies
        NonUCSC = True
        KEGGOrganism = KEGGMolecular
        MultipleListCorrection = None

        keggDict = KEGGDictionary(**({"KEGGQuery1": "pathway", "KEGGQuery2": ""} if KEGGDictionaryOptions=={} else KEGGDictionaryOptions)) if KEGGDictionaryVariable==None else KEGGDictionaryVariable

        #Obtain gene dictionary - if it exists can either augment with new information or Species or create new, if not exist then create variable
        fileMolDict = os.path.join(PyIOmicaDataDirectory, "PyIOmicaMolecularDictionary.json.gz")

        if os.path.isfile(fileMolDict):
            GeneDictionary = dataStorage.read(fileMolDict, jsonFormat=True)[1]
        else:
            fileCSV = os.path.join(PackageDirectory, "data", "MathIOmicaMolecularDictionary.csv")

            print('Attempting to read:', fileCSV)

            if os.path.isfile(fileCSV):
                with open(fileCSV, 'r') as tempFile:
                    tempLines = tempFile.readlines()
            
                tempData = np.array([line.strip('\n').replace('"', '').split(',') for line in tempLines]).T
                tempData = {'compound': {'pumchem': tempData[0].tolist(), 'cpd': tempData[1].tolist()}}
                dataStorage.write((datetime.datetime.now().isoformat(), tempData), fileMolDict, jsonFormat=True)
            else:
                print("Could not find annotation file at " + fileMolDict + " Please either obtain an annotation file from mathiomica.org or provide a GeneDictionary option variable.")
                return

            GeneDictionary = dataStorage.read(fileMolDict, jsonFormat=True)[1]

        obtainConstantGeneDictionary(GeneDictionary, {}, AugmentDictionary)

        #Get the right KEGG terms for the BackgroundSet requested and correct Species
        #If no specific options for function use BackgroundSet, Species request, length request
        Assignment = KEGGAnalysisAssigner(BackgroundSet=BackgroundSet, KEGGQuery1=KEGGDatabase, KEGGQuery2=KEGGOrganism , LengthFilter=PathwayLengthFilter) if KEGGAnalysisAssignerOptions=={} else KEGGAnalysisAssigner(**KEGGAnalysisAssignerOptions)
    
    #Gene Identifier and Molecular based analysis done concurrently
    elif AnalysisType=='All':
        argsMolecular = argsLocal.copy()
        argsMolecular['AnalysisType'] = 'Molecular'
        argsGenomic = argsLocal.copy()
        argsGenomic['AnalysisType'] = 'Genomic'

        return {"Molecular": KEGGAnalysis(**argsMolecular), "Genomic": KEGGAnalysis(**argsGenomic)}
    
    #Abort
    else:
        print("AnalysisType %s is not a valid choice."%AnalysisType)

        return

    listToggle = False

    #If the input is simply a list
    if type(data) is list:
        data = {'dummy': data}
        listToggle = True

    #The data may be a subgroup from a clustering object, i.e. a pd.DataFrame
    if type(data) is pd.DataFrame:
        id = list(data.index.get_level_values('id'))
        source = list(data.index.get_level_values('source'))
        data = [[id[i], source[i]] for i in range(len(data))]
        data = {'dummy': data}
        listToggle = True

    returning = {}

    #Check if a clustering object
    if "linkage" in data.keys():
        if MultipleListCorrection==None:
            multiCorr = 1
        elif MultipleListCorrection=='Automatic':
            multiCorr = 1
            for keyGroup in sorted([item for item in list(data.keys()) if not item=='linkage']):
                for keySubGroup in sorted([item for item in list(data[keyGroup].keys()) if not item=='linkage']):
                    multiCorr = max(max(np.unique(data[keyGroup][keySubGroup]['data'].index.get_level_values('id'), return_counts=True)[1]), multiCorr)
        else:
            multiCorr = MultipleListCorrection

        #Loop through the clustering object, calculate GO for each SubGroup
        for keyGroup in sorted([item for item in list(data.keys()) if not item=='linkage']):
            returning[keyGroup] = {}
            for keySubGroup in sorted([item for item in list(data[keyGroup].keys()) if not item=='linkage']):
                SubGroupMultiIndex = data[keyGroup][keySubGroup]['data'].index
                SubGroupGenes = list(SubGroupMultiIndex.get_level_values('id'))
                SubGroupMeta = list(SubGroupMultiIndex.get_level_values('source'))
                SubGroupList = [[SubGroupGenes[i], SubGroupMeta[i]] for i in range(len(SubGroupMultiIndex))]

                returning[keyGroup][keySubGroup] = internalAnalysisFunction({keySubGroup:SubGroupList},
                                                                             multiCorr, MultipleList,  OutputID, InputID, Species, len(Assignment[KEGGOrganism]["IDToPath"]),
                                                                             pValueCutoff, ReportFilterFunction, ReportFilter, TestFunction, HypothesisFunction, FilterSignificant,
                                                                             AssignmentForwardDictionary=Assignment[KEGGOrganism]['IDToPath'],
                                                                             AssignmentReverseDictionary=Assignment[KEGGOrganism]['PathToID'],
                                                                             prefix='hsa:' if AnalysisType=='Genomic' else '', infoDict=keggDict)[keySubGroup]

    #The data is a dictionary of type {'Name1': [data1], 'Name2': [data2], ...}
    else:
        for key in list(data.keys()):
            if MultipleListCorrection==None:
                multiCorr = 1
            elif MultipleList and MultipleListCorrection=='Automatic':
                multiCorr = max(np.unique([item[0] for item in data[key]], return_counts=True)[1])
            else:
                multiCorr = MultipleListCorrection

            returning.update(internalAnalysisFunction({key:data[key]}, multiCorr, MultipleList,  OutputID, InputID, Species, len(Assignment[KEGGOrganism]["IDToPath"]),
                                                pValueCutoff, ReportFilterFunction, ReportFilter, TestFunction, HypothesisFunction, FilterSignificant,
                                                AssignmentForwardDictionary=Assignment[KEGGOrganism]['IDToPath'],
                                                AssignmentReverseDictionary=Assignment[KEGGOrganism]['PathToID'],
                                                prefix='hsa:' if AnalysisType=='Genomic' else '', infoDict=keggDict))

        #If a single list was provided
        returning = returning['dummy'] if listToggle else returning

    return returning

[docs]def MassMatcher(data, accuracy, MassDictionaryVariable = None, MolecularSpecies = "cpd"):

    """Assign putative mass identification to input data based on monoisotopic mass 
    (using PyIOmica's mass dictionary). The accuracy in parts per million. 
    
    Parameters: 
        data: np.array
            Input data

        accuracy: float
            Accuracy

        MassDictionaryVariable: boolean, Default None
            Mass dictionary variable. If set to None, inbuilt 
            mass dictionary (MassDictionary) will be loaded and used

        MolecularSpecies: str, Default "cpd"
            The kind of molecular input

    Returns:
        list
            List of IDs 

    Usage:
       result = MassMatcher(18.010565, 2)
    """
    
    ppm = accuracy*(10**-6)

    MassDict = MassDictionary() if MassDictionaryVariable==None else MassDictionaryVariable
    keys, values = np.array(list(MassDict[MolecularSpecies].keys())), np.array(list(MassDict[MolecularSpecies].values()))

    return keys[np.where((values > data*(1 - ppm)) * (values < data*(1 + ppm)))[0]]

[docs]def MassDictionary(PyIOmicaDataDirectory=None):

    """Load PyIOmica's current mass dictionary.
    
    Parameters:
        PyIOmicaDataDirectory: str, Default None
            Directory where the default package data is stored

    Returns:
        dictionary
            Mass dictionary

    Usage:
        MassDict = MassDictionary()
    """

    global ConstantPyIOmicaDataDirectory

    PyIOmicaDataDirectory = ConstantPyIOmicaDataDirectory if PyIOmicaDataDirectory==None else PyIOmicaDataDirectory

    fileMassDict = os.path.join(PyIOmicaDataDirectory, "PyIOmicaMassDictionary.json.gz")

    if os.path.isfile(fileMassDict):
        MassDict = dataStorage.read(fileMassDict, jsonFormat=True)[1]
    else:
        fileCSV = os.path.join(PackageDirectory, "data", "MathIOmicaMassDictionary" +  ".csv")

        if False:
            with open("PyIOmicaMassDictionary", 'r') as tempFile:
                mathDictData = tempFile.readlines()

            mathDict = ''.join([line.strip('\n') for line in mathDictData]).replace('"','').replace(' ','').replace('->',' ').split(',')
            mathDict = np.array([line.split(' ') for line in mathDict])
            np.savetxt(fileCSV, mathDictData, delimiter=',', fmt='%s')

        if os.path.isfile(fileCSV):
            print('Reading:', fileCSV)

            fileMassDictData = np.loadtxt(fileCSV, delimiter=',', dtype=str)
            MassDict = {fileMassDictData[0][0].split(':')[0]: dict(zip(fileMassDictData.T[0],fileMassDictData.T[1].astype(float)))}
            dataStorage.write((datetime.datetime.now().isoformat(), MassDict), fileMassDict, jsonFormat=True)

            print("Created mass dictionary at ", fileMassDict)
        else:
            print("Could not find mass dictionary at ", fileMassDict, 
                    "Please either obtain a mass dictionary file from mathiomica.org or provide a custom file at the above location.")

            return None

    return MassDict

[docs]def ExportEnrichmentReport(data, AppendString="", OutputDirectory=None):

    """Export results from enrichment analysis to Excel spreadsheets.
    
    Parameters:
        data: dictionary
            Enrichment results

        AppendString: str, Default ""
            Custom report name, if empty then time stamp will be used

        OutputDirectory: boolean, Default None
            Path of directories where the report will be saved

    Returns:
        None

    Usage:
        ExportEnrichmentReport(goExample1, AppendString='goExample1', OutputDirectory=None)
    """

    def FlattenDataForExport(data):

        returning = {}

        if (type(data) is dict):
            if len(data)==0:
                print('The result is empty.')
                returning['List'] = data
                return 
            idata = data[list(data.keys())[0]]
            if not type(idata) is dict:
                returning['List'] = data
            elif type(idata) is dict:
                idata = idata[list(idata.keys())[0]]
                if not type(idata) is dict:
                    returning = data
                elif type(idata) is dict:
                    idata = idata[list(idata.keys())[0]]
                    if not type(idata) is dict:
                        #Loop through the clustering object
                        for keyClass in list(data.keys()):
                            for keySubClass in list(data[keyClass].keys()):
                                returning[str(keyClass)+' '+str(keySubClass)] = data[keyClass][keySubClass]
                    elif type(idata) is dict:
                        for keyAnalysisType in list(data.keys()):
                            #Loop through the clustering object
                            for keyClass in list(data[keyAnalysisType].keys()):
                                for keySubClass in list(data[keyAnalysisType][keyClass].keys()):
                                    returning[str(keyAnalysisType)+' '+str(keyClass)+' '+str(keySubClass)] = data[keyAnalysisType][keyClass][keySubClass]
        else:
            print('Results type is not supported...')

        return returning

    def ExportToFile(fileName, data):

        writer = pd.ExcelWriter(fileName)

        for key in list(data.keys()):
            keys, values = list(data[key].keys()), list(data[key].values())

            listNum = [[item for sublist in list(value)[:2] for item in sublist] for value in values]
            listNon = [list(value)[2:] for value in values]

            dataDF = [listNum[i] + listNon[i] for i in range(len(keys))]
            columns = ['p-Value', 'BH-corrected p-Value', 'Significant', 'Counts in list', 'Counts in family', 'Total members', 'Counts in members', 'Description', 'List of gene hits']

            df = pd.DataFrame(data=dataDF, index=keys, columns=columns)

            df['Significant'] = df['Significant'].map(bool)

            cleanup = lambda value: value.replace("']], ['", ' | ').replace("[", '').replace("]", '').replace("'", '').replace(", Unknown", '')
            df['List of gene hits'] = df['List of gene hits'].map(str).apply(cleanup)
            df['Description'] = df['Description'].map(str).apply(cleanup)

            df.sort_values(by='BH-corrected p-Value', inplace=True)

            df.to_excel(writer, str(key))

            writer.sheets[str(key)].set_column('A:A', df.index.astype(str).map(len).max()+2)
             
            format = writer.book.add_format({'text_wrap': False,
                                             'valign': 'top'})

            for idx, column in enumerate(df.columns):
                max_len = max((df[column].astype(str).map(len).max(),  # len of largest item
                            len(str(df[column].name)))) + 1            # len of column name/header adding a little extra space

                width = 50 if column=='Description' else min(180, max_len)

                writer.sheets[str(key)].set_column(idx+1, idx+1, width, format)  # set column width

        writer.save()

        print('Saved:', fileName)

        return None
    
    saveDir = os.path.join(os.getcwd(), "Enrichment reports") if OutputDirectory==None else OutputDirectory

    utilityFunctions.createDirectories(saveDir)

    if AppendString=="":
        AppendString=(datetime.datetime.now().isoformat().replace(' ', '_').replace(':', '_').split('.')[0])

    ExportToFile(saveDir + AppendString + '.xlsx', FlattenDataForExport(data))

    return None

[docs]def BenjaminiHochbergFDR(pValues, SignificanceLevel=0.05):

    """HypothesisTesting BenjaminiHochbergFDR correction

    Parameters:
        pValues: 1d numpy.array
            Array of p-values

        SignificanceLevel: float, Default 0.05
            Significance level

    Returns:
        dictionary
            Corrected p-Values, p- and q-Value cuttoffs

    Usage:
        result = BenjaminiHochbergFDR(pValues)
    """

    #pValues = np.round(pValues,6)
      
    #count number of hypotheses tested
    nTests = len(pValues)

    #sort the pValues in order
    sortedpVals = np.sort(pValues)

    #generate a sorting ID by ordering
    sortingIDs = np.argsort(np.argsort(pValues))

    #adjust p values to weighted p values
    weightedpVals = sortedpVals * nTests / (1 + np.arange(nTests))

    #flatten the weighted p-values to smooth out any local maxima and get adjusted p-vals
    adjustedpVals = np.array([np.min(weightedpVals[i:]) for i in range(nTests)])

    #finally,generate the qVals by reordering
    qVals = adjustedpVals[sortingIDs]

    ##create an association from which to identify correspondence between
    ##p-values and q-values#Print[{qVals,pValues}];
    pValqValAssociation = dict(zip(qVals, pValues))

    #get the cutoff adjusted q-value
    tempValues = np.flip(adjustedpVals)[np.flip(adjustedpVals) <= SignificanceLevel]
    cutoffqValue = tempValues[0] if len(tempValues) > 0 else np.nan

    #identify corresponding cutoff p-value
    if np.isnan(cutoffqValue):
        cutoffqValue = 0.
        pValCutoff = 0.
    else:
        pValCutoff = pValqValAssociation[cutoffqValue]

    #get q-vals and q-val cutoff, test the qVals for being above or below
    #significance level -- return "true" if enriched and "false" if not
    returning = {"Results": np.vstack((pValues, qVals, qVals <= cutoffqValue)),
                    "p-Value Cutoff": pValCutoff,
                    "q-Value Cutoff": cutoffqValue}

    return returning

[docs]def ReactomeAnalysis(data, 
                     uploadURL = 'https://reactome.org/AnalysisService/identifiers/projection',
                     preDownloadURL = 'https://reactome.org/AnalysisService/download/',
                     postDownloadURL = '/pathways/TOTAL/result.csv',
                     headersPOST = {'accept': 'application/json', 'content-type': 'text/plain'},
                     headersGET =  {'accept': 'text/CSV'},
                     URLparameters = (('interactors', 'false'), ('pageSize', '20'), ('page', '1'), ('sortBy', 'ENTITIES_PVALUE'), ('order', 'ASC'), ('resource', 'TOTAL'))):

    """Reactome POST-GET-style analysis.
    
    Parameters: 
        data: pd.DataFrame or list
            Data to analyze

        uploadURL: str, Default 'https://reactome.org/AnalysisService/identifiers/projection'
            URL for POST request

        preDownloadURL: str, Default 'https://reactome.org/AnalysisService/download/'
            Part 1 of URL for GET request

        postDownloadURL: str, Default '/pathways/TOTAL/result.csv'
            Part 2 of URL for GET request

        headersPOST: dict, Default {'accept': 'application/json', 'content-type': 'text/plain'}
            URL headers for POST request

        headersGET: dict, Default {'accept': 'text/CSV'}
            URL headers for GET request

        URLparameters: tuple, Default (('interactors', 'false'), ('pageSize', '20'), ('page', '1'), ('sortBy', 'ENTITIES_PVALUE'), ('order', 'ASC'), ('resource', 'TOTAL'))
            Parameters for POST request

    Returns:
        returning
            Enrichment object

    Usage:
        goExample1 = ReactomeAnalysis(["TAB1", "TNFSF13B", "MALT1", "TIRAP", "CHUK", 
                                "TNFRSF13C", "PARP1", "CSNK2A1", "CSNK2A2", "CSNK2B", "LTBR", 
                                "LYN", "MYD88", "GADD45B", "ATM", "NFKB1", "NFKB2", "NFKBIA", 
                                "IRAK4", "PIAS4", "PLAU"])
    """
    
    def internalQueryReactome(data):

        '''Function used internally

        Parameters:
            data: str, list or list-like
                Input identifiers in a form of one string, list or a list-like object

        Returns:
            pandas.DataFrame
                Reactome enrichemnt result

        Usage:
            internalQueryReactome('P01023, Q99758, O15439, O43184')
        '''

        # Check or convert data into a string of delimeter-separated values
        if type(data) is str:
            dataString = data.replace("'", "").replace("\"", "")
        else:
            if type(data) is list:
                dataString = str(data).replace("'", "").replace("\"", "").strip(']').strip('[')
            else:
                dataString = str(list(data)).replace("'", "").strip(']').strip('[')

        # POST request to uploadURL with specific 'data'
        response = requests.post(uploadURL, headers=headersPOST, params=URLparameters, data=dataString)

        # Read identifier of the POST request, to use it with GET request
        responseToken = response.json()['summary']['token']

        # GET the csv result from downloadURL using identifier from POST request
        response = requests.get(preDownloadURL + responseToken + postDownloadURL, headers=headersGET)

        # Set up a stream from GET response
        stream = io.StringIO(response.content.decode('utf-8'))
        
        # Read GET response stream into pandas DataFrame
        enrichmentDataFrame = pd.read_csv(stream, index_col=0)

        return enrichmentDataFrame

    listToggle = False

    #If the input is simply a list
    if type(data) is list:
        data = {'dummy': data}
        listToggle = True

    #The data may be a subgroup from a clustering object, i.e. a pd.DataFrame
    if type(data) is pd.DataFrame:
        id = list(data.index.get_level_values('id'))
        data = {'dummy': np.unique(id)}
        listToggle = True
    
    returning = {}

    #Check if a clustering object
    if "linkage" in data.keys():
        #Loop through the clustering object, calculate GO for each SubGroup
        for keyGroup in sorted([item for item in list(data.keys()) if not item=='linkage']):
            returning[keyGroup] = {}
            for keySubGroup in sorted([item for item in list(data[keyGroup].keys()) if not item=='linkage']):
                SubGroupMultiIndex = data[keyGroup][keySubGroup]['data'].index
                SubGroupGenes = list(SubGroupMultiIndex.get_level_values('id'))
                SubGroupList = np.unique(SubGroupGenes)

                returning[keyGroup][keySubGroup] = internalQueryReactome(SubGroupList)
                
    #The data is a dictionary of type {'Name1': [data1], 'Name2': [data2], ...}
    else:
        for key in list(data.keys()):
            returning.update({key: internalQueryReactome(data[key])})

        #If a single list was provided, return the association for Gene Ontologies
        returning = returning['dummy'] if listToggle else returning

    return returning

[docs]def ExportReactomeEnrichmentReport(data, AppendString="", OutputDirectory=None):

    """Export results from enrichment analysis to Excel spreadsheets.
    
    Parameters:
        data: dictionary or pandas.DataFrame
            Reactome pathway enrichment results

        AppendString: str, Default ""
            Custom report name, if empty then time stamp will be used

        OutputDirectory: boolean, Default None
            Path of directories where the report will be saved

    Returns:
        None

    Usage:
        ExportReactomeEnrichmentReport(example1, AppendString='example1', OutputDirectory=None)
    """

    def FlattenDataForExport(data):

        returning = {}

        if (type(data) is pd.DataFrame):
            returning['List'] = data
        elif (type(data) is dict):
            idata = data[list(data.keys())[0]]
            if type(idata) is pd.DataFrame:
                returning = data
            elif type(idata) is dict:
                idata = idata[list(idata.keys())[0]]
                if type(idata) is pd.DataFrame:
                    for keyClass in list(data.keys()):
                        for keySubClass in list(data[keyClass].keys()):
                            returning[str(keyClass)+' '+str(keySubClass)] = data[keyClass][keySubClass]
        else:
            print('Results type is not supported...')

        return returning

    def ExportToFile(fileName, data):

        writer = pd.ExcelWriter(fileName)

        for key in list(data.keys()):

            df = data[key]

            df.to_excel(writer, str(key))

            writer.sheets[str(key)].set_column('A:A', df.index.astype(str).map(len).max()+2)
             
            format = writer.book.add_format({'text_wrap': False,
                                             'valign': 'top'})

            for idx, column in enumerate(df.columns):
                max_len = max((df[column].astype(str).map(len).max(),  # len of largest item
                            len(str(df[column].name)))) + 1            # len of column name/header adding a little extra space

                width = 50 if ((column=='Pathway name') or (column=='Found reaction identifiers')) else min(180, max_len)

                writer.sheets[str(key)].set_column(idx+1, idx+1, width, format)  # set column width

        writer.save()

        print('Saved:', fileName)

        return None
    
    saveDir = os.path.join(os.getcwd(), "Enrichment reports") if OutputDirectory==None else OutputDirectory

    utilityFunctions.createDirectories(saveDir)

    if AppendString=="":
        AppendString=(datetime.datetime.now().isoformat().replace(' ', '_').replace(':', '_').split('.')[0])

    ExportToFile(os.path.join(saveDir, AppendString + '.xlsx'), FlattenDataForExport(data))

    return None