source: trunk/brs2010p35/src/jag/common.py @ 2

Last change on this file since 2 was 2, checked in by tim.te.beek@…, 8 years ago

Add revision 338 snapshot of previous SVN instance, omitting PLINK binaries

File size: 11.7 KB
Line 
1'''
2Created on Dec 13, 2010
3
4@author: maartenk, esther lips
5
6stores all functions which are called from multiple classes
7'''
8
9import logging as log
10import array
11import glob
12import math
13import os.path
14import random
15import sys
16import tempfile
17import re
18import time
19
20   
21
22def plink_results_extention(resultfile):
23    """
24    """
25    files_create_by_plink = glob.glob(resultfile + "*")
26    return _plink_results_extention(files_create_by_plink)
27
28
29def _plink_results_extention(list_of_files):
30    """
31    return extension of results from plink
32    This is handy when the extension deviates from ".assoc"
33    for instance when --covar option is used
34    """
35       
36    shortened_list = []
37    for plink_out in list_of_files:
38        if not plink_out.endswith("adjusted"):
39            if not plink_out.endswith("log"):
40                shortened_list.append(plink_out)
41   
42    regex = re.compile("P[0-9]+\.(?P<extention>[A-z\.]+)$", re.MULTILINE)
43    extentions = set(regex.findall("\n".join(shortened_list)))
44   
45    if len(extentions) == 1:
46        extention_str = extentions.pop()
47       
48    elif len(extentions) == 0:
49        regex = re.compile("\.(?P<extention>[A-z\.]+)$", re.MULTILINE)
50        extentions = set(regex.findall("\n".join(list_of_files)))
51        extention_str = extentions.pop()
52       
53    else:
54        log.info(str(extentions))
55        raise IOError(len(extentions), str(extentions) + str(list_of_files))
56
57    return(extention_str)
58
59
60def check_bim_bed_fam(path):
61    """
62    check fam , bim and bed file is present
63    """
64
65    if path[-4:] in [".fam", ".bim", ".bed"]:     #check path ends with ".fam",".bim",".bed" and remove this extension
66        path = path[:-4]
67
68    getfile_handle(path + ".fam" , "This is set is --bfile variable", False)
69    getfile_handle(path + ".bed" , "This is set is --bfile variable", False)
70    getfile_handle(path + ".bim" , "This is set is --bfile variable", False)
71
72    return path
73
74
75def getfile_handle(path, message="", verbose=True):
76    """
77    return a filehandle of path or quit
78    """
79   
80    if not path:
81        log.info(message + '\nPath not defined for ')
82        if verbose:
83            raise IOError()
84        else:
85            endtime = time.asctime(time.localtime(time.time()))
86            sys.exit("\nTerminated analysis: " + endtime + "\n")
87    try:
88        filehandle = open(path, 'rU')
89        if(os.path.getsize(path) == 0):
90            log.info("\nWarning: file size of " + path + " is zero")
91        return filehandle
92   
93    except IOError, error:
94        log.info("\n" + message + "\nCould not open " + path)
95        if verbose:
96            raise error
97        else:
98            endtime = time.asctime(time.localtime(time.time()))
99            sys.exit("\nTerminated analysis: " + endtime + "\n")
100
101
102def remove_plink_output(resultfile):
103    """
104    removes all files created by plink.
105    should function the same as bash "rm resultsfile*":
106    for instance "rm /tmp/rndvar*"
107    """
108    files_create_by_plink = glob.glob(resultfile + "*")
109    for file_to_delete in files_create_by_plink:
110        os.remove(file_to_delete)
111
112
113def average(numbers):
114    """
115    calculate average (mean) of a list of numbers
116    """
117    numbers = array.array('d', numbers)
118
119    return (sum(numbers) / len(numbers))
120
121
122def variance(numbers):
123    """
124     calculate sample variance of a list of numbers.
125     equal to R-projects var() implementation
126    """
127    #function needs at least 1 number to work
128    if not isinstance(numbers, list):
129        raise AssertionError
130
131    numbers = array.array('d', numbers)
132
133    if(len(numbers) < 2):
134        return "NA"
135    else:
136        numbers = [float(x) for x in numbers]
137        mean = average(numbers)
138        diffrence_from_mean = [a - b for a, b in zip(numbers, [mean] * len(numbers))]
139        sum_of_diff = sum([a ** b for a, b in zip(diffrence_from_mean, [2] * len(numbers))])
140        var = ((1 / (float(len(numbers)) - 1)) * sum_of_diff)
141
142    return var
143
144
145def write_text_to_tempfile(text_file):
146    """
147    writes a file to temporary file (in linux to /tmp/) and returns the filename
148    """
149    filename = tempfile.mkstemp()
150    file_handle = open(filename[1], "w")
151    # Write all the lines at once:
152    file_handle.writelines(text_file)
153    file_handle.close()
154    return filename[1]
155
156
157def extract_pheno_from_pheno_file(pheno_path, pheno_available):
158    """
159    extract pheno type from a pheno_file and include only the columns which are present in pheno_available
160    as pheno_perm_clean and the start of the lines as array
161     
162    """
163    pheno_arrays = read_pheno_file(pheno_path)
164    pheno_file_list = pheno_arrays[0] #this are the IDs
165    pheno_available_as_index = [x - 1 for x in pheno_available]
166    pheno_available_as_index.sort()
167    pheno_perm = [x.split("\t") for x in pheno_arrays[1]]
168
169    if(len(pheno_perm[0]) != len(pheno_available_as_index)):
170        log.info("WARNING: Amount of sumlog files (" + str(len(pheno_available_as_index)) \
171              + ") is not the same as amount of phenotypes (" + \
172              str(len(pheno_perm[0])) + \
173              ") defined in phenotype file")
174
175    pheno_perm_list_clean = [[y[w] for w in pheno_available_as_index] for y in pheno_perm]
176    pheno_perm_clean = ["\t".join(q) for q in pheno_perm_list_clean]
177
178    return  pheno_perm_clean, pheno_file_list
179
180
181def extract_pheno_from_fam_file(path_fam_bed_bim):
182    """
183    extract pheno type from a fam (sixth column and further ) as pheno_perm_clean and
184    the start of the lines(first two columns) as array
185    """
186    fam_fh = getfile_handle(path_fam_bed_bim + ".fam")
187 
188    pheno_perm_clean, pheno_file_list = split_fh_in_columns(fam_fh, 2, 5)
189    return pheno_perm_clean, pheno_file_list
190
191
192def split_fh_in_columns(covar_fh, split, split_2=None):
193    """
194    function to split a filehandle in a column orientation manner
195    returns a tuple of 2 lists of strings
196    The character to split on is tab
197    the split parameter is a integer to pinpoint the split
198    columns in the middle can be omited by using  spit_2     
199    """
200    if split_2 == None:
201        split_2 = split
202
203    covar_tuple = [(t[:split], t[split_2:]) for t in [x.strip().split() for x in covar_fh]]
204    covar_perm_clean = [" ".join(x[1]) for x in covar_tuple]
205    covar_file_list = [" ".join(x[0]) for x in covar_tuple]
206
207    return covar_perm_clean, covar_file_list
208
209
210def extract_pheno_from_covar_file(covar_file):
211    """
212    extract pheno type from a covar as pheno_perm_clean and
213    the start of the lines(first two columns) as array
214    """
215    covar_fh = getfile_handle(covar_file)
216    covar_perm_clean, covar_file_list = split_fh_in_columns(covar_fh, 2)
217    return covar_perm_clean, covar_file_list
218
219
220
221def make_shuffled_phenotype(seeds, pheno_perm_clean, start_of_lines):
222    """
223    """
224    for i in range(len(seeds)):
225        random.seed(seeds[i])
226        random.shuffle(pheno_perm_clean)
227        for j in range(len(start_of_lines)):
228            start_of_lines[j] = start_of_lines[j] + " " + str(pheno_perm_clean[j])
229
230    perm_text_file = ""
231   
232    for i in range(len(start_of_lines)):
233        perm_text_file = perm_text_file + start_of_lines[i] + "\n"
234
235    filename = write_text_to_tempfile(perm_text_file)
236    return filename
237
238
239def create_pheno_permutation_file(plink, seeds):
240    """
241    create permuted pheno file where length of the list seeds is the amount of permutations.
242    pheno available is a list of colum numbers that must be permuted (same as in clusterresults)
243    if no pheno file is present the 6th column of the map file will be used
244    """
245   
246    pheno_path = plink.pheno_file
247    path_fam_bed_bim = plink.bfile
248    pheno_available = plink.phenotypes_present
249   
250    if (not pheno_path == None):
251        pheno_perm_clean, start_of_lines = extract_pheno_from_pheno_file(pheno_path, pheno_available)         #extract from pheno file
252    else:
253        # get 6th column(and latter) as clean phenotype
254        pheno_perm_clean, start_of_lines = extract_pheno_from_fam_file(path_fam_bed_bim)
255   
256    filename = make_shuffled_phenotype(seeds, pheno_perm_clean, start_of_lines)
257
258    return(filename)
259
260
261def create_covar_permutation_file(plink, seeds):
262    """
263    """
264    covar_perm_clean, start_of_lines = extract_pheno_from_covar_file(plink.covar_file)
265    filename = make_shuffled_phenotype(seeds, covar_perm_clean, start_of_lines)
266    return (filename)
267
268
269def save_text_file(filename, text):
270    """ saves a string to a filename
271    If the file already exists the text will be printed and is not saved
272    """
273    file_handle = open(filename, "w")
274    file_handle.write(text)
275    file_handle.close()
276
277
278def read_pheno_file(path):
279    """
280    read a pheno file from path and return first 2 columns as list
281    and the remaining columns as list
282    """
283
284    file_handle = getfile_handle(path)
285
286    start_of_lines = []  #read the cluster of genes file
287    phenotypes = []
288   
289    for line in file_handle:
290        splitted = line.split()
291        start_of_lines.append(splitted[0] + " " + splitted[1])
292        phenotypes.append("\t".join([str(q) for q in splitted[2: ]]))
293
294    return([start_of_lines, phenotypes])
295
296
297def map_snp_to_group(group, gene_group=False):
298    """
299    read the mapping from snp id to gene and clusternumber
300     a  mapping file has 3 columns:
301    column 1:snp id (rs number)
302    column 2: gene name
303    column 3:functional group
304   
305    the output is a dict with primary key the snp name
306    each value in the primary dict consist out of a dict with a gene (key="g")
307     and a cluster number (key="c")
308    """
309   
310    if len(group) == 0:
311        sys.exit("no value for group file selected")
312   
313    file_handle = getfile_handle(group)
314    snptogroup = {}
315    #read the cluster of genes file
316    for line in file_handle:
317        splitted = line.split()
318        #assign variables to descriptive variable name
319        if (len(splitted) > 2):
320            snp = splitted[0]
321            gene = splitted[1]
322            cluster = str(splitted[2])
323        elif(len(splitted) == 2):
324            snp = splitted[0]
325            cluster = splitted[1]
326        else:
327            if len(line.strip()) == 0: # if blank lines
328                pass
329            else:
330                log.info("group file has only one 1 column: this should be at least three with on :\
331                \ncolumn 1 the SNP \ncolumn 2 the geneid\
332                \ncolumn 3 functional group(without spaces in the name)")
333                log.info("the following line is wrong: " + line)
334                sys.exit(1)
335
336        if (not snp in snptogroup):
337            snptogroup[snp] = []
338        if not gene_group:
339            snptogroup[snp].append({"g":gene, "c":cluster})
340        else:
341            snptogroup[snp].append({"g":gene, "c":gene})
342
343    return  snptogroup
344
345
346def map_group_to_snp(group):
347    """
348    read the mapping from snp id to gene and clusternumber
349     a  mapping file has 3 columns:
350    column 1:snp id (rs number)
351    column 2: gene name
352    column 3:functional group
353   
354    the output is a dict with primary key the snp name
355    each value in the primary dict consist out of a dict with a gene (key="g")
356     and a cluster number (key="c")
357    """
358    if len(group) == 0:
359        sys.exit("no value for group file selected")
360    file_handle = getfile_handle(group)
361    snptogroup = {}
362    #read the cluster of genes file
363   
364    for line in file_handle:
365        splited = line.split()
366        #assign variables to descriptive variable name
367        snp = splited[0]
368        gene = splited[1]
369        cluster = splited[2]
370
371        if (not cluster in snptogroup):
372            snptogroup[cluster] = []
373        snptogroup[cluster].append({"g":gene, "s":snp})
374
375    return  snptogroup
376
377
378def negsumlog10(numbers):
379    """
380    calculate negsumlog10 on a list of numbers
381    """
382    numbers = array.array('d', numbers)
383    neg_sum_log10_value = sum([ math.log10(x) for x in numbers])
384    #this should be -1 * neg_sum_log: abs prevents to write -0.0
385    return abs(neg_sum_log10_value)
386
387
388
Note: See TracBrowser for help on using the repository browser.