source: trunk/brs2010p35/src/jag/snp2geneset.py @ 2

Last change on this file since 2 was 2, checked in by tim.te.beek@…, 8 years ago

Add revision 338 snapshot of previous SVN instance, omitting PLINK binaries

File size: 9.3 KB
Line 
1'''
2Created on May 9, 2011
3
4@author: maartenk, esther lips
5'''
6
7import jag.common as common
8import bisect
9from operator import itemgetter
10import logging as log
11
12
13def _read_snp_file(snp_data_file, all_chromosomes):
14    """
15    Read snp_data file and skip the chromosomes that are not
16    present in the list all_chromosomes
17    """
18    chrsnpmapping = {}
19    for chr_name in all_chromosomes:
20        chrsnpmapping[chr_name] = []
21
22    snp_file_handle = common.getfile_handle(snp_data_file)
23    old_chr = ""
24
25    for line in snp_file_handle:
26        try:
27            snp_data = line.strip().split("\t")
28            current_chr = snp_data[1]
29           
30            #check chromosome is present in list of needed chromosomes
31            if not current_chr in all_chromosomes:               
32                if not old_chr == current_chr:
33                    log.info("There are no genes on chromosome " + current_chr)
34             
35                old_chr = current_chr
36               
37            else:
38                #print message when new chromosome is found
39                if not old_chr == current_chr:
40                    log.info("Mapping SNPs to genes on chromosome " + str(current_chr))
41                    old_chr = current_chr
42
43                #save snp
44                rsnumber = snp_data[0]
45                try:
46                    chr_location = int(snp_data[2])
47                    chrsnpmapping[current_chr].append((chr_location, rsnumber))
48
49                except ValueError:
50                    log.info("Ignoring following line in SNP file: " + "\t".join(snp_data))
51                   
52        except IndexError:
53            log.info("indexerror" + str(line))
54           
55        except ValueError:
56            log.info (line)
57           
58    log.info("")
59   
60    return(chrsnpmapping)
61
62
63def find_snps(chrsnpmapping, current_chromosome, sorted_keys, gene, leftside, rightside):
64    """
65    Map snps to genes
66    """
67    #searching for snps
68    left_index = bisect.bisect_left(sorted_keys, leftside)
69    right_index = bisect.bisect_right(sorted_keys, rightside)
70
71    if (left_index == 0 and right_index == 0):
72        pass
73
74    snps_raw = chrsnpmapping[current_chromosome][left_index:right_index]
75 
76    return snps_raw
77
78
79def _lookupgenes(start_ext_base, end_ext_base, chrsnpmapping, all_genes_from_all_groups):
80    """
81    lookup all genes in the list all_genes_from_all_groups.
82    The SNPs must be saved in chrsnpmapping  which is made by _read_snp_file()
83    The upstream and downstream SNPs of a gene which is included in the mapping
84    is specified by  start_ext_kb and end_ext_kb and should be in base
85    """
86    current_chromosome = None
87    sorted_keys = []
88   
89    for gene in all_genes_from_all_groups:
90        if not current_chromosome == gene["chr_number"]:
91            current_chromosome = gene["chr_number"] #order chromosome
92            chrsnpmapping[current_chromosome].sort(key=lambda r:r[0]) #create search on key list
93            sorted_keys = [r[0] for r in chrsnpmapping[current_chromosome]]
94        #calculate start and end of gene
95        leftside = 0
96        rightside = 0
97       
98        if gene["chr_strand"] == "+":
99            leftside = gene["chrstart"] - start_ext_base
100            rightside = gene["chrend"] + end_ext_base
101       
102        elif gene["chr_strand"] == "-":
103            rightside = gene["chrend"] + start_ext_base
104            leftside = gene["chrstart"] - end_ext_base
105       
106        else:
107            log.info("Invalid strand position" + str(gene["chr_strand"]))
108       
109        snps_raw = find_snps(chrsnpmapping, current_chromosome, sorted_keys, gene, leftside, rightside)
110        gene["snp_list"] = set([rs_tuple[1] for rs_tuple in snps_raw])
111       
112    return all_genes_from_all_groups
113
114
115def _read_group_gene_mapping(infile):
116    """
117    read file with genes for gene mapping
118    """
119    gene_group_mapping = [] #read gene file
120    file_handle = common.getfile_handle(infile)
121   
122    for line in file_handle: 
123        gene_group_mapping.extend([line.strip().split("\t")])
124       
125    return gene_group_mapping
126
127
128def unique(seq): 
129    """
130    get uniques
131    """ 
132    # order preserving
133    checked = [] 
134    for e in seq: 
135        if e not in checked: 
136            checked.append(e) 
137    return checked
138
139
140def _create_gene2group_mapping(gene_group_mapping):
141    """
142    map genes per gene-set
143    """
144    group2gene = {}
145    # get unique group members
146    un_gene_group_mapping = unique(gene_group_mapping)
147   
148    for line in un_gene_group_mapping:
149        try:
150            group2gene[line[1]].append(line[0])
151        except KeyError:
152            group2gene[line[1]] = [line[0]]
153
154    return group2gene
155
156
157def _read_gene_boundaries(gene_boundaries_file, genes_in_groups):
158    """
159    # read gene boundaries
160    """
161    gb_file = common.getfile_handle(gene_boundaries_file)
162    genes_with_boundaries = []
163    header = gb_file.readline() 
164
165    for line in gb_file:
166        gene_data = line.strip().split("\t")
167     
168        if gene_data[0] in genes_in_groups:
169            genes_in_groups.remove(gene_data[0]) #save gene information
170            chr_number = gene_data[2]
171            chrstrart = int(gene_data[3])
172            chrend = int(gene_data[4])
173            chr_strand = gene_data[5]
174            symbol = gene_data[1]
175            gene_info = {"geneid":gene_data[0],
176                        "chr_number":chr_number,
177                        "chrstart":chrstrart,
178                        "chrend":chrend,
179                        "chr_strand":chr_strand,
180                        "symbol":symbol,
181                        "snp_list":[]
182                        }
183            genes_with_boundaries.append(gene_info)
184   
185    genes_without_boundaries = genes_in_groups
186    return(genes_with_boundaries, genes_without_boundaries)
187
188
189def _remove_genes_without_boundaries_from_groups(group2gene, non_found_genes):
190    """
191    """
192    if not len(non_found_genes) == 0:
193        log.info("No gene boundaries are found for:")
194        for gene in non_found_genes:
195            for genes in group2gene.items():
196                if gene in genes[1]:
197                    log.info("geneID " + gene ) #+ " in " + genes[0])
198                    genes[1].remove(gene)
199        log.info("")
200    return group2gene
201
202
203def _map_lookedup_snps_to_group(all_genes_from_all_groups, group2gene):
204    """
205    """
206    geneid2gene = {}
207    snp2all_gene = {}
208    genelist = []
209   
210    for gene in all_genes_from_all_groups:
211        geneid2gene[gene["geneid"]] = gene
212        info = [gene["geneid"], gene["snp_list"]]
213        genelist.append(info)
214   
215    snp2all_gene['CODING_GENES'] = genelist
216   
217    group2snp = {}
218   
219    for (group, genes) in group2gene.iteritems():
220        try:
221            allsnps_per_group = [(geneid, geneid2gene[geneid]["snp_list"]) for geneid in genes]
222            group2snp[group] = allsnps_per_group
223
224        except KeyError:
225            log.info("")
226            log.info("Cannot not find SNPs for geneID " + str(genes))
227       
228    return snp2all_gene, group2snp
229
230
231def format_mapping(group2snp):
232    output_text = ""
233
234    for group, snps_tupple in group2snp.iteritems():
235        for gene_and_snps in snps_tupple:
236            snps = gene_and_snps[1]
237            gene = gene_and_snps[0]
238
239            if(len(snps) is not 0): # to avoid white lines in output
240                output_text += '\n'.join([snp + "\t" + gene + "\t" + group for snp in snps])
241                output_text += '\n'
242    return output_text
243
244
245def snp2geneset(infile, start_ext_kb, end_ext_kb, gene_boundaries_file, snp_data_file):
246    """
247    MAIN annotate snps to genes
248    """
249    # calculate start extra in base pair instead in kilo base pair
250    start_ext_kb = float(start_ext_kb) * 1000
251    end_ext_kb = float(end_ext_kb) * 1000
252
253    # read gene-set file
254    gene_group_mapping = _read_group_gene_mapping(infile)   
255    group2gene = _create_gene2group_mapping(gene_group_mapping) #create a group to gene mapping
256    genes_in_groups = set([group[0] for group in gene_group_mapping]) #find all unique genes
257       
258    # read data for all genes
259    all_genes_to_map = _read_group_gene_mapping(gene_boundaries_file) # read in datafile for all genes   
260    all_genes = set([gene[0] for gene in all_genes_to_map])   
261   
262    log.info("Found " + str(len(all_genes)) + " unique genes in " + gene_boundaries_file ) 
263    log.info("Found " + str(len(genes_in_groups)) + " unique genes in " + infile + "\n") 
264   
265    # remove genes from gene-sets that do not have gene boundary info
266    genes_with_boundaries, genes_without_boundaries = _read_gene_boundaries(gene_boundaries_file, genes_in_groups)
267    group2gene = _remove_genes_without_boundaries_from_groups(group2gene, genes_without_boundaries)
268     
269    # remove genes from gene-sets that do not have gene boundary info
270    all_genes_with_boundaries, all_genes_without_boundaries = _read_gene_boundaries(gene_boundaries_file, all_genes)
271    all_genes_with_boundaries = sorted(all_genes_with_boundaries, key=itemgetter('chr_number'))
272    all_chromosomes = set([gene["chr_number"] for gene in all_genes_with_boundaries])
273   
274    chrsnpmapping = _read_snp_file(snp_data_file, all_chromosomes)  # read snp file
275    print("Saving results...")
276     
277    all_genes_with_snps = _lookupgenes(start_ext_kb, end_ext_kb, chrsnpmapping, all_genes_with_boundaries)
278
279    snp2all_genes,group2snp = _map_lookedup_snps_to_group(all_genes_with_snps, group2gene)
280   
281    return(snp2all_genes, group2snp)
282
283
Note: See TracBrowser for help on using the repository browser.