#! /usr/bin/env python """ # ----------------------------------------------------------------------- # encoding: utf-8 # augmented_DIAMOnD.py # Kevin Specht # Last Modified: 2017-04-27 # This code runs the augmented DIAMOnD algorithm # as described in Augmenting DIAMOnD: A Method for Improving # Disease Networks Among Human Genes # Based on # # A DIseAse MOdule Detection (DIAMOnD) Algorithm derived from a # systematic analysis of connectivity patterns of disease proteins in # the Human Interactome # # by Susan Dina Ghiassian, Joerg Menche & Albert-Laszlo Barabasi # # # ----------------------------------------------------------------------- """ import time import cPickle import networkx as nx import numpy as np import copy import scipy.stats from collections import defaultdict import csv import sys # ============================================================================= def print_usage(): print ' ' print ' usage: ./DIAMOnD network_file seed_file n alpha(optional) outfile_name (optional)' print ' -----------------------------------------------------------------' print ' network_edgeList_file1 : The edgelist must be provided as any delimiter-separated' print ' table. Make sure the delimiter does not exit in gene IDs' print ' and is consistent across the file.' print ' The first two columns of the table will be' print ' interpreted as an interaction gene1 <==> gene2' print ' network_edgeList_file2 : The edgelist must be provided as any delimiter-separated' print ' table. Make sure the delimiter does not exit in gene IDs' print ' and is consistent across the file.' print ' The first two columns of the table will be' print ' interpreted as an interaction gene1 <==> gene2' print ' seed_file : table containing the seed genes (if table contains' print ' more than one column they must be tab-separated;' print ' the first column will be used only)' print ' n : desired number of augmented DIAMOnD genes, 200 is a reasonable' print ' starting point.' print ' alpha : an integer representing weight of the links in multiple networks,default' print ' value is set to 1' print ' outfile_name : results will be saved under this file name' print ' by default the outfile_name is set to "first_n_added_nodes_weight_alpha.txt"' print ' ' # ============================================================================= def check_input_style(input_list): try: network_edgelist_file1 = input_list[1] network_edgelist_file2 = input_list[2] seeds_file = input_list[3] max_number_of_added_nodes = int(input_list[4]) # if no input is given, print out a usage message and exit except: print_usage() sys.exit(0) return alpha = 1 outfile_name = 'first_%d_added_nodes_weight_%d.txt'%(max_number_of_added_nodes,alpha) if len(input_list)==6: try: alpha = int(input_list[5]) outfile_name = 'first_%d_added_weight_%d.txt'%(max_number_of_added_nodes,alpha) except: outfile_name = input_list[5] if len(input_list)==7: try: alpha = int(input_list[5]) outfile_name = input_list[6] except: print_usage() sys.exit(0) return return network_edgelist_file1,network_edgelist_file2,seeds_file,max_number_of_added_nodes,alpha,outfile_name # ============================================================================= def read_input(network_file,seed_file): """ Reads the main network and the list of seed genes from external files. * The edgelist must be provided as a tab-separated table. The first two columns of the table will be interpreted as an interaction gene1 <==> gene2 * The seed genes mus be provided as a table. If the table has more than one column, they must be tab-separated. The first column will be used only. * Lines that start with '#' will be ignored in both cases """ sniffer = csv.Sniffer() line_delimiter = None for line in open(network_file,'r'): if line[0]=='#': continue else: dialect = sniffer.sniff(line) line_delimiter = dialect.delimiter break if line_delimiter == None: print 'network_file format not correct' sys.exit(0) # read network: G = nx.Graph() for line in open(network_file,'r'): # lines starting with '#' will be ignored if line[0]=='#': continue # The first two columns in the line will be interpreted as an # interaction gene1 <=> gene2 #line_data = line.strip().split('\t') line_data = line.strip().split(line_delimiter) node1 = line_data[0] node2 = line_data[1] G.add_edge(node1,node2) # read the seed genes: seed_genes = set() for line in open(seed_file,'r'): # lines starting with '#' will be ignored if line[0]=='#': continue # the first column in the line will be interpreted as a seed # gene: line_data = line.strip().split('\t') seed_gene = line_data[0] seed_genes.add(seed_gene) return G,seed_genes # ================================================================================ def compare_lists(network1, network2, seedsFile): """ Determines which links from the main network also exist in the secondary network in order to determine the weights of each link """ mainList=set() #links in main network otherList=set() #links in supplementary network seedList=set() #seed genes all_genes=set() #all eligible disease genes in main network (not seeds) main=[] #temporary main network other=[] #temporary supplementary network for line in open(network1): main.append(line.rstrip()) #get main links for line in open(network2): other.append(line.rstrip()) #get supplementary lins for line in open(seedsFile): seedList.add(line.rstrip()) #get seed genes for line in main: #get all eligible disease genes from main network genes=line.split() if genes[0] not in all_genes: if genes[0] not in seedList: all_genes.add(genes[0]) if genes[1] not in all_genes: if genes[1] not in seedList: all_genes.add(genes[1]) good_links=[] #links in both networks (weighted) #sort genes in each link of main list alphabetically for link in main: genes=link.split() newlink=sorted([genes[0],genes[1]]) str='' for i in newlink: str+=i str+='\t' mainList.add(str) #sort genes in each link of supplementary list alphabetically for link in other: genes=link.split() newlink=sorted([genes[0],genes[1]]) str='' for i in newlink: str+=i str+='\t' otherList.add(str) #find all links that appear in both lists good_links=mainList.intersection(otherList) #return list of weighted links return good_links # ================================================================================ def compute_all_gamma_ln(N): """ precomputes all logarithmic gammas """ gamma_ln = {} for i in range(1,N+1): gamma_ln[i] = scipy.special.gammaln(i) return gamma_ln # ============================================================================= def logchoose(n, k, gamma_ln): if n-k+1 <= 0: return scipy.infty lgn1 = gamma_ln[n+1] lgk1 = gamma_ln[k+1] lgnk1 = gamma_ln[n-k+1] return lgn1 - [lgnk1 + lgk1] # ============================================================================= def gauss_hypergeom(x, r, b, n, gamma_ln): return np.exp(logchoose(r, x, gamma_ln) + logchoose(b, n-x, gamma_ln) - logchoose(r+b, n, gamma_ln)) # ============================================================================= def pvalue(kb, k, N, s, gamma_ln): """ ------------------------------------------------------------------- Computes the p-value for a node that has kb out of k links to seeds, given that there's a total of s sees in a network of N nodes. p-val = \sum_{n=kb}^{k} HypergemetricPDF(n,k,N,s) ------------------------------------------------------------------- """ p = 0.0 for n in range(kb,k+1): if n > s: break prob = gauss_hypergeom(n, s, N-s, k, gamma_ln) p += prob if p > 1: return 1 else: return p # ============================================================================= def get_neighbors_and_degrees(G): neighbors,all_degrees = {},{} for node in G.nodes(): nn = set(G.neighbors(node)) neighbors[node] = nn all_degrees[node] = G.degree(node) return neighbors,all_degrees # ============================================================================= # Reduce number of calculations # ============================================================================= def reduce_not_in_cluster_nodes(all_degrees,neighbors,G,not_in_cluster,cluster_nodes,alpha,good_links): reduced_not_in_cluster = {} kb2k = defaultdict(dict) i=0 for node in not_in_cluster: k = all_degrees[node] kb = 0 # Going through all neighbors and counting the number of module neighbors for neighbor in neighbors[node]: if neighbor in cluster_nodes: kb += 1 #loop through the weighted links for link in good_links: #if either gene is the current gene in the network # add extra weight to its connections, if the other # gene is a seed gene add extra weight to seed connections genes=link.split() if node==genes[0]: k += (alpha-1) if genes[1] in cluster_nodes: kb += (alpha-1) elif node==genes[1]: k += (alpha-1) if genes[0] in cluster_nodes: kb += (alpha-1) kb2k[kb][k] =node i=i+1 # Going to choose the node with largest kb, given k k2kb = defaultdict(dict) for kb,k2node in kb2k.iteritems(): min_k = min(k2node.keys()) node = k2node[min_k] k2kb[min_k][kb] = node for k,kb2node in k2kb.iteritems(): max_kb = max(kb2node.keys()) node = kb2node[max_kb] reduced_not_in_cluster[node] =(max_kb,k) return reduced_not_in_cluster #====================================================================================== # C O R E A L G O R I T H M #====================================================================================== def diamond_iteration_of_first_X_nodes(G,S,X,alpha,good_links): """ Parameters: ---------- - G: graph - S: seeds - X: the number of iterations, i.e only the first X genes will be pulled in - alpha: seeds weight - good_links: set of weighted links in the main network Returns: -------- - added_nodes: ordered list of nodes in the order by which they are agglomerated. Each entry has 4 info: * name : dito * k : degree of the node * kb : number of +1 neighbors * p : p-value at agglomeration """ N = G.number_of_nodes() added_nodes = [] # ------------------------------------------------------------------ # Setting up dictionaries with all neighbor lists # and all degrees # ------------------------------------------------------------------ neighbors,all_degrees = get_neighbors_and_degrees(G) # ------------------------------------------------------------------ # Setting up initial set of nodes in cluster # ------------------------------------------------------------------ cluster_nodes = set(S) not_in_cluster = set() s0 = len(cluster_nodes) #loop through the weighted links for link in good_links: genes=link.split() #if either gene is a seed gene add extra weight to seed genes if genes[0] in cluster_nodes: s0 += (alpha-1) if genes[1] in cluster_nodes: s0 += (alpha-1) #add extra weight to total genes N +=(alpha-1) # ------------------------------------------------------------------ # precompute the logarithmic gamma functions # ------------------------------------------------------------------ gamma_ln = compute_all_gamma_ln(N+1) # ------------------------------------------------------------------ # Setting initial set of nodes not in cluster # (non-seed nodes connected to seed nodes) # ------------------------------------------------------------------ for node in cluster_nodes: not_in_cluster |= neighbors[node] not_in_cluster -= cluster_nodes # ------------------------------------------------------------------ # # M A I N L O O P # # ------------------------------------------------------------------ all_p = {} while len(added_nodes) < X: # ------------------------------------------------------------------ # # Going through all nodes that are not in the cluster yet and # record k, kb and p # # ------------------------------------------------------------------ info = {} pmin = 10 next_node = 'nix' reduced_not_in_cluster = reduce_not_in_cluster_nodes(all_degrees, neighbors,G, not_in_cluster, cluster_nodes,alpha,good_links) for node,kbk in reduced_not_in_cluster.iteritems(): # Getting the p-value of this kb,k # combination and save it in all_p, so computing it only once! #print(node) kb,k = kbk try: p = all_p[(k,kb,s0)] except KeyError: p = pvalue(kb, k, N, s0, gamma_ln) all_p[(k,kb,s0)] = p # recording the node with smallest p-value if p < pmin: pmin = p next_node = node info[node] = (k,kb,p) # --------------------------------------------------------------------- # Adding node with smallest p-value to the list of aaglomerated nodes # --------------------------------------------------------------------- added_nodes.append((next_node, info[next_node][0], info[next_node][1], info[next_node][2])) # Updating the list of cluster nodes and s0 cluster_nodes.add(next_node) s0 = len(cluster_nodes) not_in_cluster |= ( neighbors[next_node] - cluster_nodes ) not_in_cluster.remove(next_node) return added_nodes # =========================================================================== # # M A I N D I A M O n D A L G O R I T H M # # =========================================================================== def augDIAMOnD(G_original,seed_genes,max_number_of_added_nodes,alpha,good_links,outfile = None): """ Runs the augmented DIAMOnD algorithm Input: ------ - G_original : The network - seed_genes : a set of seed genes - max_number_of_added_nodes: after how many added nodes should the algorithm stop - alpha: given weight to the sees - good_links: set of weighted links in the main network - outfile: filename for the output generates by the algorithm, if not given the program will name it 'first_x_added_nodes.txt' Returns: -------- - added_nodes: A list with 4 entries at each element: * name : name of the node * k : degree of the node * kb : number of neighbors that are part of the module (at agglomeration) * p : connectivity p-value at agglomeration - """ # 1. throwing away the seed genes that are not in the network all_genes_in_network = set(G_original.nodes()) seed_genes = set(seed_genes) disease_genes = seed_genes & all_genes_in_network if len(disease_genes) != len(seed_genes): print "augDIAMOnD(): ignoring %s of %s seed genes that are not in the network" %( len(seed_genes - all_genes_in_network), len(seed_genes)) # 2. agglomeration algorithm. added_nodes = diamond_iteration_of_first_X_nodes(G_original, disease_genes, max_number_of_added_nodes,alpha,good_links) # 3. saving the results with open(outfile,'w') as fout: print>>fout,'\t'.join(['#rank','DIAMOnD_node']) rank = 0 for DIAMOnD_node_info in added_nodes: rank += 1 DIAMOnD_node = DIAMOnD_node_info[0] p = float(DIAMOnD_node_info[3]) print>>fout,'\t'.join(map(str,([rank,DIAMOnD_node]))) return added_nodes # =========================================================================== # # "Hey Ho, Let's go!" -- The Ramones (1976) # # =========================================================================== if __name__ == '__main__': # ----------------------------------------------------- # Checking for input from the command line: # ----------------------------------------------------- # # [1] file providing the main network in the form of an edgelist # (tab-separated table, columns 1 & 2 will be used) # # [2] file providing the secondary network in the form of an edgelist # (tab-separated table, columns 1 & 2 will be used) # # [3] file with the seed genes (if table contains more than one # column they must be tab-separated; the first column will be # used only) # # [4] number of desired iterations # # [5] (optional) weight of links in multiple networks (integer), default value is 1 # [6] (optional) name for the results file #check if input style is correct input_list = sys.argv network_edgelist_file1,network_edgelist_file2,seeds_file,max_number_of_added_nodes,alpha,outfile_name= check_input_style(input_list) # read the networks and the seed genes: G_original,seed_genes = read_input(network_edgelist_file1,seeds_file) good_links=compare_lists(network_edgelist_file1, network_edgelist_file2, seeds_file) # run augmented DIAMOnD added_nodes = augDIAMOnD(G_original, seed_genes, max_number_of_added_nodes,alpha,good_links, outfile=outfile_name) print "\n results have been saved to '%s' \n" %outfile_name