1 | """MS/MS search engine results parsers. |
---|
2 | |
---|
3 | """ |
---|
4 | import sys |
---|
5 | import os |
---|
6 | import re |
---|
7 | import csv |
---|
8 | import urllib |
---|
9 | import fnmatch |
---|
10 | #from optparse import OptionParser |
---|
11 | from string import Template |
---|
12 | from collections import defaultdict |
---|
13 | from collections import namedtuple |
---|
14 | from itertools import izip, groupby |
---|
15 | |
---|
16 | SEPARATOR = '--gc0p4Jq0M2Yt08jU534c0p' |
---|
17 | # TODO: Use new style string formatting 2.6 and ditch Template |
---|
18 | SECTION_TEMPLATE = Template( |
---|
19 | r'Content-Type: application/x-Mascot; name="$section"') |
---|
20 | |
---|
21 | PEP_REGEX = re.compile(r'^q(\d+)' # nativeid |
---|
22 | '_p(\d+)' # rank |
---|
23 | '=\d+?' # missed cleavages |
---|
24 | ',([\.\d]+?)' # peptide mass |
---|
25 | ',([-\.\d]+?)' # delta mass |
---|
26 | ',\d+?' # n ions matches |
---|
27 | ',(\w+?)' # peptide string |
---|
28 | ',\d+?' # peaks used for Ions1 |
---|
29 | ',(\d+?)' # modstring |
---|
30 | ',([\.\d]+?)' # score |
---|
31 | ',\d+?' # ion series found |
---|
32 | ',\d+?' # peaks used for Ions2 |
---|
33 | ',\d+?' # peaks used for Ions3 |
---|
34 | ';(.+)$' # protein accessions string |
---|
35 | ) |
---|
36 | |
---|
37 | MascotPsm = namedtuple('MascotPsm', |
---|
38 | 'rank ' |
---|
39 | 'pep_mass ' |
---|
40 | 'delta_mass ' |
---|
41 | 'pep_seq ' |
---|
42 | 'score ' |
---|
43 | 'prot_ids' |
---|
44 | ) |
---|
45 | |
---|
46 | mascot_ptms = {'Acetyl (Protein N-term)': 'ac', |
---|
47 | 'Oxidation (M)': 'ox', |
---|
48 | 'Phospho (ST)': 'p', |
---|
49 | 'Dimethyl (K)': 'l', |
---|
50 | 'Dimethyl (N-term)': 'l', |
---|
51 | 'Dimethyl:2H(4) (K)': 'i', |
---|
52 | 'Dimethyl:2H(4) (N-term)': 'i', |
---|
53 | 'Dimethyl:2H(6)13C(2) (K)': 'h', |
---|
54 | 'Dimethyl:2H(6)13C(2) (N-term)': 'h', |
---|
55 | } |
---|
56 | |
---|
57 | class DatParser(object): |
---|
58 | """Mascot Dat Parser |
---|
59 | """ |
---|
60 | def __init__(self, |
---|
61 | protein_factory, |
---|
62 | peptide_factory, |
---|
63 | spectrum_factory, |
---|
64 | psm_factory, |
---|
65 | serparam_factory, |
---|
66 | scan_str=r'FinneganScanNumber%3a%20(\d+)', |
---|
67 | rawfn_str=r'(RawFile|period)%3a%20(.+raw)', |
---|
68 | decoy_str=r'^REV_.*', |
---|
69 | contaminant_str=r'^CON_.*', |
---|
70 | # XXX: Fix default factories |
---|
71 | ): |
---|
72 | self.scan_regex = re.compile(scan_str) |
---|
73 | decoy_regex = re.compile(decoy_str) |
---|
74 | contaminant_regex = re.compile(contaminant_str) |
---|
75 | # Contaminants are not exactly decoy psms |
---|
76 | self.non_target_regexes = (contaminant_regex, decoy_regex) |
---|
77 | self.rawfn_regex = re.compile(rawfn_str) |
---|
78 | self.protein_factory = protein_factory |
---|
79 | self.peptide_factory = peptide_factory |
---|
80 | self.spectrum_factory = spectrum_factory |
---|
81 | self.psm_factory = psm_factory |
---|
82 | self.serparam_factory = serparam_factory |
---|
83 | |
---|
84 | def _parse_params(self, dat_file): |
---|
85 | """Parses SER parameters and returns a SerParam object. |
---|
86 | """ |
---|
87 | for line in dat_file: |
---|
88 | if line.startswith("TOL="): |
---|
89 | prec_tol = line.strip().split('=')[1] |
---|
90 | prec_units = dat_file.next().strip().split('=')[1] |
---|
91 | elif line.startswith("ITOL="): |
---|
92 | frag_tol = line.strip().split('=')[1] |
---|
93 | frag_units = dat_file.next().strip().split('=')[1] |
---|
94 | elif line.startswith("DB="): |
---|
95 | db_name = line.strip().split('=')[1] |
---|
96 | elif line.startswith("CLE="): |
---|
97 | enzyme = line.strip().split('=')[1] |
---|
98 | elif line.startswith("FILE="): |
---|
99 | full_path = line.strip().split('=')[1] |
---|
100 | norm_path = full_path.replace('\\', '/') |
---|
101 | pkl_fn = norm_path.split("/")[-1] |
---|
102 | elif line.startswith("INSTRUMENT="): |
---|
103 | frag_line = line.strip().split('=')[1] |
---|
104 | if frag_line == "ETD-TRAP": |
---|
105 | frag_mode = 'ETD' |
---|
106 | elif frag_line == "ESI-TRAP": |
---|
107 | frag_mode = 'CID' |
---|
108 | elif line.startswith("QUANTITATION="): |
---|
109 | quantitation = line.strip().split('=')[1] |
---|
110 | return self.serparam_factory( |
---|
111 | prec_tol=(prec_tol, prec_units), |
---|
112 | frag_tol=(frag_tol, frag_units), |
---|
113 | db_name=db_name, |
---|
114 | enzyme=enzyme, |
---|
115 | pkl_fn=pkl_fn, |
---|
116 | quantitation=quantitation, |
---|
117 | frag_mode=frag_mode, |
---|
118 | ) |
---|
119 | |
---|
120 | def _parse_masses(self, dat_file): |
---|
121 | """Parses the modifications used by mascot |
---|
122 | """ |
---|
123 | ptms = [None] |
---|
124 | masses_section = SECTION_TEMPLATE.substitute(section="masses") |
---|
125 | for line in dat_file: |
---|
126 | if line.strip() == masses_section: |
---|
127 | break |
---|
128 | |
---|
129 | for line in dat_file: |
---|
130 | if line.startswith("delta"): |
---|
131 | m_ptm = line.strip().split('=')[1].split(',') |
---|
132 | ptm_sym = mascot_ptms[m_ptm[1]] |
---|
133 | ptm = (m_ptm[0], ptm_sym) |
---|
134 | ptms.append(ptm) |
---|
135 | elif line.strip() == SEPARATOR: |
---|
136 | return ptms |
---|
137 | |
---|
138 | def _parse_summary(self, dat_file): |
---|
139 | """Parse summary section |
---|
140 | """ |
---|
141 | summspec = list() |
---|
142 | summary_section = SECTION_TEMPLATE.substitute(section="summary") |
---|
143 | for line in dat_file: |
---|
144 | if line.strip() == summary_section: |
---|
145 | break |
---|
146 | for line in dat_file: |
---|
147 | if line.startswith('qexp'): |
---|
148 | r_mz, r_charge = line.strip().split('=')[1].split(',') |
---|
149 | mz = float(r_mz) |
---|
150 | charge = int(r_charge.rstrip('+')) |
---|
151 | summspec.append((mz, charge)) |
---|
152 | elif line.strip() == SEPARATOR: |
---|
153 | return summspec |
---|
154 | |
---|
155 | def _parse_psms(self, dat_file): |
---|
156 | """Parses the peptide section of mascot |
---|
157 | """ |
---|
158 | nativeid_psms = defaultdict(list) |
---|
159 | # Seek until Mascot peptide section |
---|
160 | peptide_section = SECTION_TEMPLATE.substitute(section="peptides") |
---|
161 | for line in dat_file: |
---|
162 | if line.strip() == peptide_section: |
---|
163 | break |
---|
164 | for line in dat_file: |
---|
165 | match = re.match(PEP_REGEX, line) |
---|
166 | if match: |
---|
167 | nativeid = int(match.group(1)) |
---|
168 | rank = int(match.group(2)) |
---|
169 | pep_mass = float(match.group(3)) |
---|
170 | delta_mass = float(match.group(4)) |
---|
171 | aas = match.group(5) |
---|
172 | mods_str = match.group(6) |
---|
173 | pep_seq = apply_mods(aas, mods_str, self.m_ptms) |
---|
174 | score = float(match.group(7)) |
---|
175 | accs = match.group(8) |
---|
176 | prot_ids = get_prot_ids(accs) |
---|
177 | target_prots = filter_target_prots(prot_ids, |
---|
178 | self.non_target_regexes) |
---|
179 | if target_prots and (rank == 1 or rank == 2): |
---|
180 | mascot_psm = MascotPsm( |
---|
181 | rank=rank, |
---|
182 | pep_mass=pep_mass, |
---|
183 | delta_mass=delta_mass, |
---|
184 | pep_seq=pep_seq, |
---|
185 | score=score, |
---|
186 | prot_ids=target_prots |
---|
187 | ) |
---|
188 | nativeid_psms[nativeid].append(mascot_psm) |
---|
189 | elif line.strip() == SEPARATOR: |
---|
190 | return nativeid_psms |
---|
191 | |
---|
192 | def _parse_extraspec(self, dat_file): |
---|
193 | """Gets the data coming from the input file from Mascot dat |
---|
194 | format. |
---|
195 | """ |
---|
196 | extraspec = list() |
---|
197 | for line in dat_file: |
---|
198 | if line.startswith("title="): |
---|
199 | line = line.strip() |
---|
200 | try: |
---|
201 | quoted_rawfn = re.search(self.rawfn_regex, |
---|
202 | line).group(2) |
---|
203 | rawfn = urllib.unquote(quoted_rawfn) |
---|
204 | except AttributeError: |
---|
205 | sys.exit("It seems the raw regex is not valid. " |
---|
206 | "The line is: " + repr(line)) |
---|
207 | try: |
---|
208 | scan = int(re.search(self.scan_regex, line).group(1)) |
---|
209 | except AttributeError: |
---|
210 | sys.exit("Default scan regex for TITLE field" |
---|
211 | " doesn't match. TITLE: " + repr(line)) |
---|
212 | extraspec.append([rawfn, scan]) |
---|
213 | elif line.startswith("Ions1="): |
---|
214 | peaks = list() |
---|
215 | mascot_ions = line.strip().split("=")[1].split(",") |
---|
216 | for pair in mascot_ions: |
---|
217 | mz, inten = pair.split(':') |
---|
218 | peak = float(mz), float(inten) |
---|
219 | peaks.append(peak) |
---|
220 | extraspec[-1].append(tuple(peaks)) |
---|
221 | |
---|
222 | return extraspec |
---|
223 | |
---|
224 | def parse(self, dat_file, protid_sequence): |
---|
225 | """Takes a dat file and returns a dictionary of psms. |
---|
226 | """ |
---|
227 | serparams = self._parse_params(dat_file) |
---|
228 | self.m_ptms = self._parse_masses(dat_file) |
---|
229 | summspec = self._parse_summary(dat_file) |
---|
230 | nativeid_psms = self._parse_psms(dat_file) |
---|
231 | extraspec = self._parse_extraspec(dat_file) |
---|
232 | proteins = dict() |
---|
233 | spectra = dict() |
---|
234 | peptides = dict() |
---|
235 | psms = dict() |
---|
236 | # TODO: Take all these loops to other functions or methods |
---|
237 | for summ, extra in izip(summspec, extraspec): |
---|
238 | spectrum = self.spectrum_factory( |
---|
239 | prec_mz=summ[0], |
---|
240 | prec_charge=summ[1], |
---|
241 | run=extra[0], |
---|
242 | scan=extra[1], |
---|
243 | peaks=extra[2], |
---|
244 | ) |
---|
245 | spectra['-'.join((str(spectrum.scan), |
---|
246 | spectrum.run))] = spectrum |
---|
247 | # XXX: Clean up this MESS |
---|
248 | for nativeid, m_psms in nativeid_psms.iteritems(): |
---|
249 | for m_psm in m_psms: |
---|
250 | run, spec, peaks = extraspec[nativeid - 1] |
---|
251 | spec_id = '-'.join((str(spec), run)) |
---|
252 | peptide = self.peptide_factory( |
---|
253 | sequence=m_psm.pep_seq, |
---|
254 | mass=m_psm.pep_mass |
---|
255 | ) |
---|
256 | psm = self.psm_factory( |
---|
257 | score=m_psm.score, |
---|
258 | rank=m_psm.rank, |
---|
259 | delta_mass=m_psm.delta_mass, |
---|
260 | pep_ref=peptide, |
---|
261 | spec_ref=spectra[spec_id], |
---|
262 | params_ref=serparams |
---|
263 | ) |
---|
264 | psms['--'.join((peptide.formatted_seq(), spec_id))] = psm |
---|
265 | spectra[spec_id].psm_refs.add(psm) |
---|
266 | if peptide.formatted_seq() not in peptides: |
---|
267 | peptide.psm_refs = set([psm]) |
---|
268 | peptides[peptide.formatted_seq()] = peptide |
---|
269 | else: |
---|
270 | peptide = peptides[peptide.formatted_seq()] |
---|
271 | peptide.psm_refs.add(psm) |
---|
272 | for prot_id in m_psm.prot_ids: |
---|
273 | if prot_id not in proteins: |
---|
274 | protein = self.protein_factory( |
---|
275 | native_id=prot_id, |
---|
276 | sequence=protid_sequence[prot_id][0], |
---|
277 | ext_ids = protid_sequence[prot_id][1], |
---|
278 | description=protid_sequence[prot_id][2], |
---|
279 | pep_refs=set([peptide]) |
---|
280 | ) |
---|
281 | proteins[prot_id] = protein |
---|
282 | else: |
---|
283 | proteins[prot_id].pep_refs.add(peptide) |
---|
284 | return proteins, peptides, spectra, psms |
---|
285 | |
---|
286 | def apply_mods(aas, mods_str, ptms): |
---|
287 | """Takes a naked peptides sequence, the mod string from Mascot and ptm |
---|
288 | map and returns a complete peptide sequence |
---|
289 | """ |
---|
290 | f_aas = ''.join(('.', aas, '.')) |
---|
291 | pep_seq = list() |
---|
292 | for a, m in izip(f_aas, mods_str): |
---|
293 | aa_m = (a, ptms[int(m)]) |
---|
294 | pep_seq.append(aa_m) |
---|
295 | return tuple(pep_seq) |
---|
296 | |
---|
297 | def get_prot_ids(accs): |
---|
298 | """From a Mascot accessions field return a tuple with all proteins |
---|
299 | ids. |
---|
300 | """ |
---|
301 | return tuple(x[1].strip('"') |
---|
302 | for x in (y.split(':') |
---|
303 | for y in accs.split(',"') |
---|
304 | ) |
---|
305 | ) |
---|
306 | |
---|
307 | def filter_target_prots(prot_ids, regexes): |
---|
308 | """Filters a list of protein ids for target protein ids |
---|
309 | """ |
---|
310 | target_prots = list() |
---|
311 | for prot_id in prot_ids: |
---|
312 | non_target_flag = True |
---|
313 | for regex in regexes: |
---|
314 | if re.match(regex, prot_id): |
---|
315 | non_target_flag = False |
---|
316 | if non_target_flag: |
---|
317 | target_prots.append(prot_id) |
---|
318 | return target_prots |
---|
319 | |
---|
320 | def write_csv(psms, out_file): |
---|
321 | """Takes a dictionary of PSMs and writes a modified InsPecT output. |
---|
322 | """ |
---|
323 | fieldnames = ("#SpectraFile", |
---|
324 | "RawFile", |
---|
325 | "Fraction", |
---|
326 | "Scan", |
---|
327 | "Annotation", |
---|
328 | "Charge", |
---|
329 | "FragMode", |
---|
330 | "IsDecoy") |
---|
331 | header = dict() |
---|
332 | for fieldname in fieldnames: |
---|
333 | header[fieldname] = fieldname |
---|
334 | writer = csv.DictWriter(out_file, fieldnames, dialect='excel-tab') |
---|
335 | for psm in psms: |
---|
336 | writer.writerow(psm) |
---|
337 | |
---|
338 | def parse_fasta(fasta_fn): |
---|
339 | """Update protein sequence container from fasta file""" |
---|
340 | with open(fasta_fn, 'rb') as fasta_file: |
---|
341 | # Ditching the key of each iterator group because we know |
---|
342 | # the headers and sequence alternate all the time |
---|
343 | protid_sequence = defaultdict(str) |
---|
344 | groupgen = (i[1] for i in groupby(fasta_file, |
---|
345 | lambda l: l.startswith(">"))) |
---|
346 | for header in groupgen: |
---|
347 | header_line = header.next().strip() |
---|
348 | if header_line.startswith(">IPI:IPI"): |
---|
349 | ext_ids = dict() |
---|
350 | description = str() |
---|
351 | header_fields = header_line.split("|") |
---|
352 | prot_id = header_fields[0][5:] |
---|
353 | for field in header_fields[1:]: |
---|
354 | if not ' ' in field: |
---|
355 | db, db_id = field.split(':') |
---|
356 | ext_ids[db] = db_id |
---|
357 | else: |
---|
358 | trailing_fields = field.split(' ') |
---|
359 | for t_field in trailing_fields: |
---|
360 | if ":" in t_field: |
---|
361 | db, db_id = t_field.split(":") |
---|
362 | ext_ids[db] = db_id |
---|
363 | elif "=" in t_field: |
---|
364 | db, db_ids = t_field.split("=") |
---|
365 | # Take only 1st ID in case there are many |
---|
366 | db_id = db_ids.split(";")[0] |
---|
367 | ext_ids[db] = db_id |
---|
368 | else: |
---|
369 | description = " ".join((description[:], |
---|
370 | t_field)) |
---|
371 | sequence = "".join(s.strip() for s in groupgen.next()) |
---|
372 | protid_sequence[prot_id] = sequence, ext_ids, description |
---|
373 | return protid_sequence |
---|
374 | |
---|
375 | # XXX: Use better defaults for containers and factories |
---|
376 | def parse_dats(dats_dir, proteins_container=dict, peptides_container=dict, |
---|
377 | spectra_container=dict, psms_container=dict, |
---|
378 | protein_factory=dict, peptide_factory=dict, |
---|
379 | spectrum_factory=dict, psm_factory=dict, |
---|
380 | serparam_factory=dict): |
---|
381 | """Parses all the dat files in dats_dir |
---|
382 | """ |
---|
383 | proteins = proteins_container |
---|
384 | peptides = peptides_container |
---|
385 | spectra = spectra_container |
---|
386 | psms = psms_container |
---|
387 | dat_parser = DatParser(protein_factory=protein_factory, |
---|
388 | peptide_factory=peptide_factory, |
---|
389 | spectrum_factory=spectrum_factory, |
---|
390 | psm_factory=psm_factory, |
---|
391 | serparam_factory=serparam_factory) |
---|
392 | import transaction |
---|
393 | for root, dirs, files in os.walk(dats_dir): |
---|
394 | for fn in files: |
---|
395 | if fnmatch.fnmatch(fn, '*.fasta'): |
---|
396 | protid_sequence = parse_fasta(os.path.join(root, fn)) |
---|
397 | for root, dirs, files in os.walk(dats_dir): |
---|
398 | for fn in files: |
---|
399 | if fnmatch.fnmatch(fn, '*.dat'): |
---|
400 | with open(os.path.join(root, fn)) as dat_file: |
---|
401 | ser_proteins, ser_peptides, \ |
---|
402 | ser_spectra, ser_psms = dat_parser.parse(dat_file, |
---|
403 | protid_sequence) |
---|
404 | proteins.update(ser_proteins) |
---|
405 | transaction.savepoint() |
---|
406 | for pep_id, pep in ser_peptides.items(): |
---|
407 | if not pep_id in peptides: |
---|
408 | peptides[str(pep_id)] = pep |
---|
409 | transaction.savepoint() |
---|
410 | for spec_id, spec in ser_spectra.items(): |
---|
411 | if spec_id not in spectra: |
---|
412 | spectra[str(spec_id)] = spec |
---|
413 | transaction.savepoint() |
---|
414 | for psm_id, psm in ser_psms.items(): |
---|
415 | if psm not in psms: |
---|
416 | psms[str(psm_id)] = psm |
---|
417 | transaction.savepoint() |
---|
418 | elif fnmatch.fnmatch(fn, '*.fasta'): |
---|
419 | fasta_path = os.path.join(root, fn) |
---|
420 | |
---|
421 | """ |
---|
422 | def main(): |
---|
423 | usage = "usage: %prog [options] mascotresult1.dat [mascotresult2.dat ...]" |
---|
424 | optparser = OptionParser(usage) |
---|
425 | optparser.add_option("-o", |
---|
426 | "--output", |
---|
427 | dest="out_fn", |
---|
428 | help="Output file [default: stdout]", |
---|
429 | metavar="FILE") |
---|
430 | optparser.add_option("-s", |
---|
431 | "--scans-in-title", |
---|
432 | dest="scans_in_title", |
---|
433 | type="int", |
---|
434 | default="1", |
---|
435 | help="1 to parse the scan number from the TITLE field, " |
---|
436 | "0 to parse the scan number from the SCAN field " |
---|
437 | "[default: %default]") |
---|
438 | optparser.add_option("-d", |
---|
439 | "--decoy-str", |
---|
440 | dest="decoy_str", |
---|
441 | default=r'^IPI:REV_:IPI', |
---|
442 | help="1 to parse the scan number from the TITLE field, " |
---|
443 | "0 to parse the scan number from the SCAN field " |
---|
444 | "[default: %default]") |
---|
445 | (options, args) = optparser.parse_args() |
---|
446 | |
---|
447 | if len(args) == 0: |
---|
448 | optparser.print_usage() |
---|
449 | |
---|
450 | else: |
---|
451 | for arg in args: |
---|
452 | if not arg.endswith(".dat"): |
---|
453 | optparser.error("Provide only mascot dat files") |
---|
454 | else: |
---|
455 | if options.out_fn: |
---|
456 | out_fn = options.out_file |
---|
457 | out_file = open(out_fn, 'wb') |
---|
458 | else: |
---|
459 | out_file = sys.stdout |
---|
460 | for arg in args: |
---|
461 | with open(arg) as in_file: |
---|
462 | dat_parser = DatParser(options.decoy_str, |
---|
463 | scans_in_title = options.scans_in_title) |
---|
464 | psms = dat_parser.parse(in_file) |
---|
465 | write_csv(psms, out_file) |
---|
466 | |
---|
467 | if __name__ == '__main__': |
---|
468 | main() |
---|
469 | """ |
---|