#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""
-----------------------------------------------------------------------
parse_annotations.py
Kevin Specht
Last Modified: 2017-04-27

This code takes a set of GO terms associated with a set of seed genes
for a particular disease and a set of potential disease genes obtained
by an algorithm and determines which of them belong to the disease module.

For every prefix of the candidate gene list (first gene, first two genes,
...) the number of "true positive" genes is written to the results file,
one count per line.
-----------------------------------------------------------------------
"""

# -----------------------------------------------------
# Checking for input from the command line:
# -----------------------------------------------------
#
# [1] file with the seed GO terms (if table contains more than one
#     column they must be tab-separated; the first column will be
#     used only)
#
# [2] file with the potential disease genes from the algorithm
#     (if table contains more than one column they must be
#     tab-separated; the first column will be used only)
#
# [3] name for the results file

import sys
import time
import copy
import csv

# Gene annotation file read by parse_annotations() when no path is given.
ANNOTATION_FILE = "goa_human.txt"


def _read_first_column(path):
    """Return the first tab-separated column of every non-blank line.

    Trailing whitespace is stripped from each value. Lines whose first
    column is empty are skipped: an empty string is a substring of every
    annotation line and would otherwise match everything.
    """
    values = []
    with open(path, 'r') as handle:
        for line in handle:
            value = line.rstrip("\r\n").split("\t")[0].rstrip()
            if value:
                values.append(value)
    return values


def get_annotations(path=None):
    """Get the list of GO terms associated with the seed genes.

    Args:
        path: file with one GO term per line (first tab-separated column
            is used). Defaults to the first command-line argument.

    Returns:
        List of GO term strings in file order.
    """
    if path is None:
        path = sys.argv[1]
    return _read_first_column(path)


def get_genes(path=None):
    """Get the list of potential disease genes obtained from the algorithm.

    Args:
        path: file with one gene per line (first tab-separated column is
            used). Defaults to the second command-line argument.

    Returns:
        List of gene strings in file order (duplicates are kept).
    """
    if path is None:
        path = sys.argv[2]
    return _read_first_column(path)


def parse_annotations(termList, geneList, annotation_path=ANNOTATION_FILE):
    """Find the candidate genes annotated with at least one seed GO term.

    A gene counts as a true positive when some line of the annotation
    file contains both the gene and one of the terms.

    NOTE(review): matching is a plain substring test on the whole line,
    exactly as in the original implementation, so a short gene name also
    matches longer identifiers that contain it. Confirm whether exact
    column matching is wanted before changing this.

    Args:
        termList: GO terms associated with the seed genes.
        geneList: candidate genes to test.
        annotation_path: gene annotation file to scan; defaults to
            "goa_human.txt" in the current working directory.

    Returns:
        List of true-positive genes, without duplicates, in the order of
        their first occurrence in geneList.
    """
    if not termList or not geneList:
        return []

    # Single pass over the (large) annotation file: keep only the lines
    # that mention a seed term, instead of rescanning the file per gene.
    with open(annotation_path, 'r') as annotations:
        relevant_lines = [
            line for line in annotations
            if any(term in line for term in termList)
        ]

    true_positives = []
    checked = set()
    for gene in geneList:
        if gene in checked:
            continue  # duplicates never change the result
        checked.add(gene)
        if any(gene in line for line in relevant_lines):
            true_positives.append(gene)
    return true_positives


def _cumulative_true_positives(genes, positives):
    """Return, for each prefix of genes, its number of distinct positives."""
    counts = []
    counted = set()
    for gene in genes:
        if gene in positives and gene not in counted:
            counted.add(gene)
        counts.append(len(counted))
    return counts


def main(argv=None):
    """Main program: write the true-positive count for every iteration.

    Line j of the results file holds the number of true positives among
    the first j candidate genes. The annotation file is parsed once and
    the per-prefix counts are derived from that single result, which
    yields the same numbers as re-running the GO test for every prefix.

    Args:
        argv: argument vector laid out like sys.argv; defaults to sys.argv.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 4:
        sys.exit("usage: parse_annotations.py <seed_GO_terms> "
                 "<candidate_genes> <results_file>")

    terms = get_annotations(argv[1])
    genes = get_genes(argv[2])
    positives = set(parse_annotations(terms, genes))

    with open(argv[3], 'w') as results:  # write to outfile
        for count in _cumulative_true_positives(genes, positives):
            results.write(str(count))  # record number of true positives
            print(str(count))
            results.write("\n")
    print("Check file")


if __name__ == "__main__":
    main()