#! /usr/bin/env python


"""
# -----------------------------------------------------------------------
# encoding: utf-8

# augmented_DIAMOnD.py
# Kevin Specht
# Last Modified: 2017-04-27

# This code runs the augmented DIAMOnD algorithm
#  as described in Augmenting DIAMOnD: A Method for Improving
#  Disease Networks Among Human Genes

# Based on
# 
# A DIseAse MOdule Detection (DIAMOnD) Algorithm derived from a
# systematic analysis of connectivity patterns of disease proteins in
# the Human Interactome
#
# by Susan Dina Ghiassian, Joerg Menche & Albert-Laszlo Barabasi
# 
# 
# -----------------------------------------------------------------------
"""

import time
import cPickle
import networkx as nx
import numpy as np
import copy
import scipy.stats
from collections import defaultdict
import csv
import sys


# =============================================================================
def print_usage():
   
    print ' '
    print '        usage: ./DIAMOnD network_file seed_file n alpha(optional) outfile_name (optional)'
    print '        -----------------------------------------------------------------'
    print '        network_edgeList_file1 : The edgelist must be provided as any delimiter-separated'
    print '                       table. Make sure the delimiter does not exit in gene IDs' 
    print '                       and is consistent across the file.' 
    print '                       The first two columns of the table will be'
    print '                       interpreted as an interaction gene1 <==> gene2'
    print '        network_edgeList_file2 : The edgelist must be provided as any delimiter-separated'
    print '                       table. Make sure the delimiter does not exit in gene IDs' 
    print '                       and is consistent across the file.' 
    print '                       The first two columns of the table will be'
    print '                       interpreted as an interaction gene1 <==> gene2'
    print '        seed_file    : table containing the seed genes (if table contains'
    print '                       more than one column they must be tab-separated;'
    print '                       the first column will be used only)'
    print '        n            : desired number of augmented DIAMOnD genes, 200 is a reasonable'
    print '                       starting point.'
    print '        alpha        : an integer representing weight of the links in multiple networks,default'
    print '                       value is set to 1'
    print '        outfile_name : results will be saved under this file name'
    print '                       by default the outfile_name is set to "first_n_added_nodes_weight_alpha.txt"'
    print ' '


# =============================================================================
def check_input_style(input_list):
    try:
        network_edgelist_file1 = input_list[1]
        network_edgelist_file2 = input_list[2]
        seeds_file = input_list[3]
        max_number_of_added_nodes = int(input_list[4])
    # if no input is given, print out a usage message and exit
    except:
        print_usage()
        sys.exit(0)
        return 
    
    alpha = 1
    outfile_name = 'first_%d_added_nodes_weight_%d.txt'%(max_number_of_added_nodes,alpha)

    if len(input_list)==6:
        try:
            alpha = int(input_list[5])
            outfile_name = 'first_%d_added_weight_%d.txt'%(max_number_of_added_nodes,alpha)
        except:
            outfile_name = input_list[5] 
    
    if len(input_list)==7:    
        try:
            alpha = int(input_list[5])
            outfile_name = input_list[6]
        except:
            print_usage()       
            sys.exit(0)
            return
    return network_edgelist_file1,network_edgelist_file2,seeds_file,max_number_of_added_nodes,alpha,outfile_name

# =============================================================================
def read_input(network_file,seed_file):
    """
    Reads the main network and the list of seed genes from external files.

    * The edgelist must be provided as a tab-separated table. The
    first two columns of the table will be interpreted as an
    interaction gene1 <==> gene2

    * The seed genes mus be provided as a table. If the table has more
    than one column, they must be tab-separated. The first column will
    be used only.

    * Lines that start with '#' will be ignored in both cases
    """

    sniffer = csv.Sniffer()
    line_delimiter = None
    for line in open(network_file,'r'):
        if line[0]=='#':
            continue
        else:
            dialect = sniffer.sniff(line)
            line_delimiter = dialect.delimiter
            break
    if line_delimiter == None:
        print 'network_file format not correct'
        sys.exit(0)


    # read network:
    G = nx.Graph()
    for line in open(network_file,'r'):
        # lines starting with '#' will be ignored
        if line[0]=='#':
            continue
        # The first two columns in the line will be interpreted as an
        # interaction gene1 <=> gene2
        #line_data   = line.strip().split('\t')
        line_data = line.strip().split(line_delimiter)
        node1 = line_data[0]
        node2 = line_data[1]
        G.add_edge(node1,node2)

    # read the seed genes:
    seed_genes = set()
    for line in open(seed_file,'r'):
        # lines starting with '#' will be ignored
        if line[0]=='#':
            continue
        # the first column in the line will be interpreted as a seed
        # gene:
        line_data = line.strip().split('\t')
        seed_gene = line_data[0]
        seed_genes.add(seed_gene)

    return G,seed_genes

# ================================================================================
def compare_lists(network1, network2, seedsFile):
    """
    Determines which links from the main network also exist in the
    secondary network in order to determine the weights of each link
    """
    
    mainList=set() #links in main network
    otherList=set() #links in supplementary network
    seedList=set() #seed genes
    all_genes=set() #all eligible disease genes in main network (not seeds)

    main=[] #temporary main network
    other=[] #temporary supplementary network
    for line in open(network1):
        main.append(line.rstrip()) #get main links
    for line in open(network2):
        other.append(line.rstrip()) #get supplementary lins
    for line in open(seedsFile):
        seedList.add(line.rstrip()) #get seed genes
    for line in main: #get all eligible disease genes from main network
        genes=line.split()
        if genes[0] not in all_genes:
            if genes[0] not in seedList:
                all_genes.add(genes[0])
        if genes[1] not in all_genes:
            if genes[1] not in seedList:
                all_genes.add(genes[1])

    good_links=[] #links in both networks (weighted)

    #sort genes in each link of main list alphabetically
    for link in main:
        genes=link.split()
        newlink=sorted([genes[0],genes[1]])
        str=''
        for i in newlink:
            str+=i
            str+='\t'
        mainList.add(str)

    #sort genes in each link of supplementary list alphabetically
    for link in other:
        genes=link.split()
        newlink=sorted([genes[0],genes[1]])
        str=''
        for i in newlink:
            str+=i
            str+='\t'
        otherList.add(str)

    #find all links that appear in both lists
    good_links=mainList.intersection(otherList)

    #return list of weighted links
    return good_links
        
# ================================================================================
def compute_all_gamma_ln(N):
    """
    precomputes all logarithmic gammas 
    """
    gamma_ln = {}
    for i in range(1,N+1):
        gamma_ln[i] = scipy.special.gammaln(i)

    return gamma_ln

# =============================================================================
def logchoose(n, k, gamma_ln):
    if n-k+1 <= 0:
        return scipy.infty
    lgn1  = gamma_ln[n+1]                                                      
    lgk1  = gamma_ln[k+1]                                                      
    lgnk1 = gamma_ln[n-k+1]                                                    
    return lgn1 - [lgnk1 + lgk1]                                               

# =============================================================================
def gauss_hypergeom(x, r, b, n, gamma_ln):                                     
    return np.exp(logchoose(r, x, gamma_ln) +
                  logchoose(b, n-x, gamma_ln) -
                  logchoose(r+b, n, gamma_ln))     
                                                                               
# =============================================================================
def pvalue(kb, k, N, s, gamma_ln):                                        
    """                                                                        
    -------------------------------------------------------------------        
    Computes the p-value for a node that has kb out of k links to              
    seeds, given that there's a total of s sees in a network of N nodes.       
                                                                               
    p-val = \sum_{n=kb}^{k} HypergemetricPDF(n,k,N,s)                          
    -------------------------------------------------------------------        
    """                                                                        
    p = 0.0                                                                    
    for n in range(kb,k+1):                                                    
        if n > s:                                                              
            break                                                              
        prob = gauss_hypergeom(n, s, N-s, k, gamma_ln)                                                                                   
        p += prob                                                              
                                                                               
    if p > 1:                                                                  
        return 1                                                               
    else:                                                                      
        return p                                                               

# =============================================================================
def get_neighbors_and_degrees(G):

    neighbors,all_degrees = {},{}
    for node in G.nodes():
        nn = set(G.neighbors(node))
        neighbors[node] = nn
        all_degrees[node] = G.degree(node)

    return neighbors,all_degrees

# =============================================================================
# Reduce number of calculations
# =============================================================================
def reduce_not_in_cluster_nodes(all_degrees,neighbors,G,not_in_cluster,cluster_nodes,alpha,good_links): 
    reduced_not_in_cluster = {}                                                        
    kb2k = defaultdict(dict)
    i=0
    for node in not_in_cluster:                                                        
        
        k = all_degrees[node]                                                          
        kb = 0                                                                         
        # Going through all neighbors and counting the number of module neighbors        
        for neighbor in neighbors[node]:                                               
            if neighbor in cluster_nodes:                                              
                kb += 1
        
        #loop through the weighted links
        for link in good_links:
            #if either gene is the current gene in the network
            #  add extra weight to its connections, if the other
            #  gene is a seed gene add extra weight to seed connections
            genes=link.split()
            if node==genes[0]:
                k += (alpha-1)
                if genes[1] in cluster_nodes:
                    kb += (alpha-1)
            elif node==genes[1]:
                k += (alpha-1)
                if genes[0] in cluster_nodes:
                    kb += (alpha-1)
        kb2k[kb][k] =node
        i=i+1
    # Going to choose the node with largest kb, given k                                
    k2kb = defaultdict(dict)                                                           
    for kb,k2node in kb2k.iteritems():                                                 
        min_k = min(k2node.keys())                                                     
        node = k2node[min_k]                                                           
        k2kb[min_k][kb] = node                                                         
                                                                                       
    for k,kb2node in k2kb.iteritems():                                                 
        max_kb = max(kb2node.keys())                                                   
        node = kb2node[max_kb]                                                         
        reduced_not_in_cluster[node] =(max_kb,k)                                       
                                                                                       
    return reduced_not_in_cluster                                                      

#======================================================================================
#   C O R E    A L G O R I T H M
#======================================================================================
def diamond_iteration_of_first_X_nodes(G,S,X,alpha,good_links):
    
    """

    Parameters:                                                                     
    ----------                                                                      
    - G:     graph
    - S:     seeds 
    - X:     the number of iterations, i.e only the first X genes will be
             pulled in
    - alpha: seeds weight
    - good_links: set of weighted links in the main network

    Returns:                                                                        
    --------
    
    - added_nodes: ordered list of nodes in the order by which they
      are agglomerated. Each entry has 4 info:

      * name : dito
      * k    : degree of the node
      * kb   : number of +1 neighbors
      * p    : p-value at agglomeration

    """
    
    N = G.number_of_nodes()
    added_nodes = []


    # ------------------------------------------------------------------
    # Setting up dictionaries with all neighbor lists
    # and all degrees
    # ------------------------------------------------------------------
    neighbors,all_degrees = get_neighbors_and_degrees(G)

    # ------------------------------------------------------------------
    # Setting up initial set of nodes in cluster
    # ------------------------------------------------------------------
    
    cluster_nodes = set(S)
    not_in_cluster = set()
    s0 = len(cluster_nodes)

    #loop through the weighted links
    for link in good_links:
        genes=link.split()
        #if either gene is a seed gene add extra weight to seed genes
        if genes[0] in cluster_nodes:
            s0 += (alpha-1)
        if genes[1] in cluster_nodes:
            s0 += (alpha-1)
        #add extra weight to total genes
        N +=(alpha-1)
    
    # ------------------------------------------------------------------
    # precompute the logarithmic gamma functions
    # ------------------------------------------------------------------
    gamma_ln = compute_all_gamma_ln(N+1)
    
    # ------------------------------------------------------------------
    # Setting initial set of nodes not in cluster
    #   (non-seed nodes connected to seed nodes)
    # ------------------------------------------------------------------
    for node in cluster_nodes:
        not_in_cluster |= neighbors[node]
    not_in_cluster -= cluster_nodes

    # ------------------------------------------------------------------
    #
    # M A I N     L O O P 
    #
    # ------------------------------------------------------------------

    all_p = {}

    while len(added_nodes) < X:    

        # ------------------------------------------------------------------
        #
        # Going through all nodes that are not in the cluster yet and
        # record k, kb and p 
        #
        # ------------------------------------------------------------------
        
        info = {}
    
        pmin = 10
        next_node = 'nix'
        reduced_not_in_cluster = reduce_not_in_cluster_nodes(all_degrees,
                                                             neighbors,G,
                                                             not_in_cluster,
                                                             cluster_nodes,alpha,good_links)
        
        for node,kbk in reduced_not_in_cluster.iteritems():
            # Getting the p-value of this kb,k
            # combination and save it in all_p, so computing it only once!
            #print(node)
            kb,k = kbk
            try:
                p = all_p[(k,kb,s0)]
            except KeyError:
                p = pvalue(kb, k, N, s0, gamma_ln)
                all_p[(k,kb,s0)] = p
             
            # recording the node with smallest p-value
            if p < pmin:
                pmin = p
                next_node = node
    
            info[node] = (k,kb,p)

        # ---------------------------------------------------------------------
        # Adding node with smallest p-value to the list of aaglomerated nodes
        # ---------------------------------------------------------------------
        added_nodes.append((next_node,
                            info[next_node][0],
                            info[next_node][1],
                            info[next_node][2]))

        # Updating the list of cluster nodes and s0
        cluster_nodes.add(next_node)
        s0 = len(cluster_nodes)
        not_in_cluster |= ( neighbors[next_node] - cluster_nodes )
        not_in_cluster.remove(next_node)

    return added_nodes

# ===========================================================================
#
#   M A I N    D I A M O n D    A L G O R I T H M
# 
# ===========================================================================
def augDIAMOnD(G_original,seed_genes,max_number_of_added_nodes,alpha,good_links,outfile = None):

    """
    Runs the augmented DIAMOnD algorithm

    Input:
    ------
     - G_original :
             The network
     - seed_genes : 
             a set of seed genes 
     - max_number_of_added_nodes:
             after how many added nodes should the algorithm stop
     - alpha:
             given weight to the sees
     - good_links:
             set of weighted links in the main network
     - outfile:
             filename for the output generates by the algorithm,
             if not given the program will name it 'first_x_added_nodes.txt'

     Returns:
     --------
      - added_nodes: A list with 4 entries at each element:
            * name : name of the node
            * k    : degree of the node
            * kb   : number of neighbors that are part of the module (at agglomeration)
            * p    : connectivity p-value at agglomeration
      -           
    """
    
    # 1. throwing away the seed genes that are not in the network
    all_genes_in_network = set(G_original.nodes())
    seed_genes = set(seed_genes)
    disease_genes = seed_genes & all_genes_in_network

    if len(disease_genes) != len(seed_genes):
        print "augDIAMOnD(): ignoring %s of %s seed genes that are not in the network" %(
            len(seed_genes - all_genes_in_network), len(seed_genes))
   

    # 2. agglomeration algorithm. 
    added_nodes = diamond_iteration_of_first_X_nodes(G_original,
                                                     disease_genes,
                                                     max_number_of_added_nodes,alpha,good_links)
    # 3. saving the results 
    with open(outfile,'w') as fout:
        print>>fout,'\t'.join(['#rank','DIAMOnD_node'])
        rank = 0
        for DIAMOnD_node_info in added_nodes:
            rank += 1
            DIAMOnD_node = DIAMOnD_node_info[0]
            p = float(DIAMOnD_node_info[3])
            print>>fout,'\t'.join(map(str,([rank,DIAMOnD_node])))

    return added_nodes


# ===========================================================================
#
# "Hey Ho, Let's go!" -- The Ramones (1976)
#
# ===========================================================================


if __name__ == '__main__':


    # -----------------------------------------------------
    # Checking for input from the command line:
    # -----------------------------------------------------
    #
    # [1] file providing the main network in the form of an edgelist
    #     (tab-separated table, columns 1 & 2 will be used)
    #
    # [2] file providing the secondary network in the form of an edgelist
    #     (tab-separated table, columns 1 & 2 will be used)
    #
    # [3] file with the seed genes (if table contains more than one
    #     column they must be tab-separated; the first column will be
    #     used only)
    #
    # [4] number of desired iterations
    #
    # [5] (optional) weight of links in multiple networks (integer), default value is 1 
    # [6] (optional) name for the results file 

    #check if input style is correct
    input_list = sys.argv
    network_edgelist_file1,network_edgelist_file2,seeds_file,max_number_of_added_nodes,alpha,outfile_name= check_input_style(input_list)

    # read the networks and the seed genes:
    G_original,seed_genes = read_input(network_edgelist_file1,seeds_file)

    good_links=compare_lists(network_edgelist_file1, network_edgelist_file2, seeds_file)
    
    # run augmented DIAMOnD
    added_nodes = augDIAMOnD(G_original,
                          seed_genes,
                          max_number_of_added_nodes,alpha,good_links,
                          outfile=outfile_name)
    
    print "\n results have been saved to '%s' \n" %outfile_name