source: trunk/brs2010p35/src/jag/drawrandom.py @ 2

Last change on this file since 2 was 2, checked in by tim.te.beek@…, 8 years ago

Add revision 338 snapshot of previous SVN instance, omitting PLINK binaries

File size: 13.0 KB
Line 
1"""
2performs step 4 of the jag pipeline
3
4@author: maartenk, esther lips
5"""
6
7import jag.common as common
8import sys
9import random
10import logging as log
11import os.path
12import time
13
14
15def _get_terminated_time():
16    """
17    get system end time
18    """
19    termtime = time.asctime(time.localtime(time.time()))
20    return("\nAnalysis terminated: " + termtime + "\n")
21
22
23class DrawRandom(object):
24    """
25    Class to draw random genes or snps
26    """
27
28    def __init__(self, genemapping, snpmapping, inoutput):
29        self.genemapping = genemapping
30        self.snpmapping = snpmapping
31        self.inoutput = inoutput
32        self.accessed = None
33        self.exclude = True
34
35
36    def setseed(self, seed):
37        """
38        setting seed to a value: this function is used for testing and debugging
39        """
40        random.seed(seed)
41        log.info("setting seed to value: " + str(seed))
42
43
44class Genes(object):
45    """
46    class for drawing SNP's out of genes
47    """
48    def __init__(self, drawrandom):
49        self.drawrandom = drawrandom
50        self.exclude_genes = {}
51
52
53    def _gene2snpmapping(self):
54        """
55        create a mapping from a gene to SNP mapping file
56        The key a is a gene and the value is a list of SNP's
57        """
58       
59        allsnp_and_genes_fh = common.getfile_handle(self.drawrandom.genemapping)
60        gene2snp_mapping = {}
61        for text in allsnp_and_genes_fh:
62            text_array = text.strip().split("\t")
63           
64            if len(text.strip()) != 0:  # if line is not empty
65                try:
66                    gene2snp_mapping[text_array[1]].append(text_array[0])
67                except KeyError:
68                    gene2snp_mapping[text_array[1]] = [text_array[0]]
69
70        return gene2snp_mapping
71
72
73    def genes(self, group, amount):
74        """
75        Draw random genes, where group is de group name selected
76         and amount the number of new groups to be formed
77        """
78        group_to_snp_mapping = common.map_group_to_snp(self.drawrandom.snpmapping)
79        if (group not in group_to_snp_mapping):
80            log.info("\n" + group + " is not known as a group name. Please check your data for correct group name.")
81            log.info(_get_terminated_time())
82            sys.exit()
83        #load amount of  genes in group by the mapping file
84        self.exclude_genes = set([ x["g"] for x in group_to_snp_mapping[group]])
85
86        ngenes = len(self.exclude_genes)
87        gene2snp = self._gene2snpmapping()  #create gene to all SNP's mapping
88        all_genes = set(gene2snp.keys())
89       
90        if(len(all_genes) <= (ngenes*amount)):
91            log.info("Your sample of " + str(len(all_genes)) + " genes is too small to draw " + str(amount) + " random genesets of size " + str(ngenes))
92            log.info(_get_terminated_time())
93            sys.exit()
94       
95        log.info("Size of gene pool: " + str(len(all_genes)))
96        log.info("Needed size of gene pool to draw desired " + str(amount) + " gene-sets = " + str(ngenes*amount))
97   
98        #exclude genes that are in the selected group
99        if (self.drawrandom.exclude):
100            all_genes = all_genes.difference(set(self.exclude_genes))
101
102        output_text = " RS#\tGeneID\tDraw_#\n"
103       
104        for new_group_number in xrange(1, amount + 1):
105
106            groupname = "Draw_" + str(new_group_number)
107            sampled_keys = random.sample(all_genes, ngenes)
108
109            for gene_as_key in sampled_keys:
110                #draw new_group_number random index
111                output_text += '\n'.join([str(snp) + "\t" + str(gene_as_key) + \
112                        "\t" + groupname for snp in gene2snp[gene_as_key]])
113                output_text += '\n'
114
115        incl_or_excl = "excluding" if self.drawrandom.exclude else "including"      #create correct filename
116        filename = "random_draws_ngenes_" + incl_or_excl + "_" + str(group) + ".txt"
117        out = self.drawrandom.inoutput.save_text_to_filename(filename, output_text)     #save file
118        log.info("\nSaved random draws on number of genes as " + out)
119       
120        return(output_text)
121
122
123class Snp(object):
124    """
125    class for drawing SNP's
126    """
127    def __init__(self, drawrandom):
128        self.drawrandom = drawrandom
129        self.inregion = "all"
130        self.exclude_snps = {}
131
132
133    def _create_indep_snp_file(self, plink):
134        """
135        Create a list of independent snps.
136        plink: a plink object.(only needs a bfile)
137       
138        """
139        log.info("Performing LD based pruning...")
140        plink.set_plink_arguments("--bfile " + plink.bfile + " --indep-pairwise 200 5 0.25")
141        resultfile = plink.run_plink()
142        prune_file = resultfile + ".prune.in"
143     
144        snp2gene_mapping = {}   #create genemapping mapping
145        file_handle = common.getfile_handle(self.drawrandom.genemapping)
146       
147        for text in file_handle:
148            text_array = text.strip().split()
149            if len(text.strip()) != 0:
150                try:
151                    snp2gene_mapping[text_array[0]] += "," + text_array[1]
152                except KeyError:
153                    snp2gene_mapping[text_array[0]] = text_array[1]
154                       
155        # use snp2gene_mapping to map the snp's to genes
156        outfile_text = ""
157        file_handle = common.getfile_handle(prune_file)
158
159        for text in file_handle:
160            rs_number = text.strip()
161            try:
162                outfile_text += rs_number + '\t' + str(snp2gene_mapping[rs_number]) + "\n"  #add mapping
163            except KeyError:
164                outfile_text += str(rs_number) + "\t" + " " + "\n"  #if there is no mapping found add a whitespace(" ")
165       
166        outfile = self.drawrandom.inoutput.save_text_to_filename("prune.in", outfile_text)  #save mapped prune file to prunefile
167        log.info("Saved pruned SNP file as " + outfile)
168       
169        return (outfile)
170
171
172    def _get_random_snp(self, allsnp_and_genes, amount_allsnp_and_genes):
173        """
174        draw SNPs or genes without replacement (drawn items in self.accessed)
175        Also checks optional if gene or snp is in original list group mapping and exclude
176        this gene/snp to be drawn This is regulated by self.exclude (True of False)
177       
178        allsnp_and_genes: list of raw readlines in form "RS12345\t12345\n"
179        amount_allsnp_and_genes: length of amount_allsnp_and_genes
180        """
181       
182        rand_acces = random.randint(0, amount_allsnp_and_genes)
183       
184        if (rand_acces not in self.drawrandom.accessed):
185            self.drawrandom.accessed.add(rand_acces)
186            rsnumber, genename = [a.strip() for a in allsnp_and_genes[rand_acces].split('\t')]
187           
188            if(self.inregion == "all"):
189                pass
190           
191            elif(self.inregion == "genic"):
192                if(genename == ""):
193                    rsnumber, genename = self._get_random_snp(allsnp_and_genes, amount_allsnp_and_genes)
194                   
195            elif(self.inregion == "intergenic"):
196                if(genename != ""):
197                    rsnumber, genename = self._get_random_snp(allsnp_and_genes, amount_allsnp_and_genes)
198
199            if(self.drawrandom.exclude):    #check gene is in the group in snp mapping otherwise draw new
200                if(rsnumber in self.exclude_snps): #allsnp_and_genes.pop(rand_acces)
201                    rsnumber, genename = self._get_random_snp(allsnp_and_genes, amount_allsnp_and_genes)
202
203        else:
204            rsnumber, genename = self._get_random_snp(allsnp_and_genes, amount_allsnp_and_genes)
205
206        return rsnumber, genename
207
208
209    def snp(self, group, amount, empp_file, plink, out):
210        """
211        draw random snp, on number of neff snps in the empp.out file, from an independent snp file (.prune.in). If this file is not
212        present, this file will be created within _create_indep_snp_file function.
213       
214        """
215       
216        group_to_snp_mapping = common.map_group_to_snp(self.drawrandom.snpmapping)
217       
218        if (group not in group_to_snp_mapping):
219            log.info("\n" + group + " is not known as a group name. Please check your data for correct group name.")
220            log.info(_get_terminated_time())
221
222            sys.exit()
223
224        self.exclude_snps = set([ x["s"] for x in group_to_snp_mapping[group]]) #drawsnps
225       
226        try:
227            n_snp = _read_empp_results(empp_file)[group]
228
229        except ValueError:
230            log.info("Group to select is not found in empp file")
231            log.info(_get_terminated_time())
232           
233            sys.exit()
234       
235        prunedin_file = str(os.path.abspath(os.path.curdir)) + "/" + out + ".prune.in" #get pruned.in file
236        prunedin = os.path.exists(prunedin_file)
237       
238        if (prunedin == False): #create prune.in from file, if it does not excist
239            prunedin = self._create_indep_snp_file(plink)
240           
241        else:
242            log.info ("\nUsing the pruned SNP set from " + prunedin_file)
243            prunedin = prunedin_file
244
245        allsnp_and_genes = common.getfile_handle(prunedin).readlines() #pylint: disable=E1103
246       
247        length_genic = 0
248        length_nongenic = 0
249       
250        for line in allsnp_and_genes:
251            line = line.rsplit('\t')
252            gene = line[1].rsplit()
253                     
254            if len(gene) is not 0:
255                length_genic += 1
256            else:
257                length_nongenic += 1
258                                     
259        amount_allsnp_and_genes = len(allsnp_and_genes) - 1
260             
261        random_snp_text = "RS#\tGeneID\tDraw_#\n"
262       
263        for new_group_number in xrange(1, amount + 1):
264            self.drawrandom.accessed = set()
265            groupname = "Draw_" + str(new_group_number)
266
267            count = 0
268            while(count < n_snp):
269                random_snp_text += ("\t".join(self._get_random_snp(allsnp_and_genes, amount_allsnp_and_genes)))
270                random_snp_text += ("\t" + groupname + "\n")
271                count = count + 1
272               
273        inregion_text = "unknown_in_region"
274       
275        if(self.inregion == "genic"):
276            inregion_text = "genic_snps"
277           
278            if amount*n_snp > length_genic:
279                log.info("\nWarning: pool of " + str(length_genic) + " SNPs located within genes is to small to draw " + \
280                 str(amount) + " x " + str(n_snp) + " independent nEff SNPs!")
281                log.info(_get_terminated_time())
282                sys.exit()
283            else:               
284                log.info("\nDrawing " + str(amount) + " x " + str(n_snp) + " nEff SNPs from a pool of " + \
285                 str(length_genic) + " SNPs located within genes")
286           
287        elif(self.inregion == "intergenic"):
288            inregion_text = "intergenic_snps"
289           
290            if amount*n_snp > length_nongenic:
291                log.info("\nWarning: pool of " + str(length_nongenic) + " SNPs located outside genes is to small to draw " + \
292                 str(amount) + " x " + str(n_snp) + " independent nEff SNPs!")
293                log.info(_get_terminated_time())
294                sys.exit()
295            else:
296                log.info("\nDrawing " + str(amount) + " x " + str(n_snp) + " nEff SNPs from a pool of " + \
297                 str(length_nongenic) + " SNPs located outside genes")
298           
299        elif(self.inregion == "all"):
300            inregion_text = "all_snps"
301           
302            if amount*n_snp > len(allsnp_and_genes):
303                log.info("\nWarning: pool of " + str(len(allsnp_and_genes)) + " SNPs located in- and outside genes is to small to draw " + \
304                 str(amount) + " x " + str(n_snp) + " independent nEff SNPs!")
305                log.info(_get_terminated_time())
306                sys.exit()
307            else:
308                log.info("\nDrawing " + str(amount) + " x " + str(n_snp) + " nEff SNPs from a pool of " + \
309                 str(len(allsnp_and_genes)) + " SNPs located in- and outside genes")
310           
311        incl_or_excl = "unknown"
312       
313        if (self.drawrandom.exclude is False):
314            incl_or_excl = "incl"
315           
316        elif (self.drawrandom.exclude is True):
317            incl_or_excl = "excl"
318       
319        filename = "random_draws_neff_" + \
320         inregion_text + "_" + incl_or_excl + "_" + str(group) + ".txt"
321
322        out = self.drawrandom.inoutput.save_text_to_filename(filename, random_snp_text)     #save random snps file
323       
324        log.info("\nSaved random draws on number of effective SNPS as " + out)
325        return(filename)
326
327
328def _read_empp_results(file_name):
329    """
330    Read a empp file and get nEff back as a dict with the groupname as key
331    """
332
333    file_handle = common.getfile_handle(file_name)
334    header_raw = file_handle.readline()
335    header = header_raw.strip().split("\t")
336
337    try:
338        neff_column = header.index("nEff")
339    except ValueError:
340        sys.exit("nEff column not found in header of " + file_name)
341
342    group_neff_mapping = {}
343   
344    for line in file_handle:
345        splitted_line = line.strip().split("\t")
346        group_neff_mapping[str(splitted_line[0])] = int(splitted_line[neff_column])
347   
348    return (group_neff_mapping)
349
350
Note: See TracBrowser for help on using the repository browser.