1 | """Mascot dat format to InsPecT output converter. |
---|
2 | |
---|
3 | """ |
---|
4 | import sys |
---|
5 | import os |
---|
6 | import re |
---|
7 | import csv |
---|
8 | from optparse import OptionParser |
---|
9 | from string import Template |
---|
10 | from collections import defaultdict |
---|
11 | |
---|
12 | SEPARATOR = '--gc0p4Jq0M2Yt08jU534c0p' |
---|
13 | SECTION_TEMPLATE = Template( |
---|
14 | r'Content-Type: application/x-Mascot; name="$section"') |
---|
15 | |
---|
16 | PEP_REGEX = re.compile(r'^q(\d+)' # nativeid |
---|
17 | '_p\d+' # rank |
---|
18 | '=\d+?' # missed cleavages |
---|
19 | ',[\.\d]+?' # peptide mass |
---|
20 | ',[-\.\d]+?' # delta mass |
---|
21 | ',\d+?' # n ions matches |
---|
22 | ',(\w+?)' # peptide string |
---|
23 | ',\d+?' # peaks used for Ions1 |
---|
24 | ',(\d+?)' # modstring |
---|
25 | ',[\.\d]+?' # score |
---|
26 | ',\d+?' # ion series found |
---|
27 | ',\d+?' # peaks used for Ions2 |
---|
28 | ',\d+?' # peaks used for Ions3 |
---|
29 | ';(.+)$' # protein accessions string |
---|
30 | ) |
---|
31 | #fraction_regex = re.compile(r'fraction%3a%20(\d+)') |
---|
32 | RAWFN_REGEX = re.compile(r'raw%3a%20(.+raw)') |
---|
33 | |
---|
34 | class DatParser(object): |
---|
35 | """Mascot Dat Parser |
---|
36 | """ |
---|
37 | def __init__(self, decoy_str, scans_in_title=False): |
---|
38 | self.decoy_regex = re.compile(decoy_str) |
---|
39 | self.scans_in_title = scans_in_title |
---|
40 | if scans_in_title: |
---|
41 | self.scan_regex = re.compile(r'scan%3a%20(\d+)') |
---|
42 | |
---|
43 | def _parse_spectra_fn(self, datfile): |
---|
44 | """Parses the spectra file name used for Mascot search |
---|
45 | """ |
---|
46 | for line in datfile: |
---|
47 | if line.startswith("FILE="): |
---|
48 | #XXX: There must be something in the standard library |
---|
49 | # to normalize this properly |
---|
50 | full_path = line.strip().split('=')[1] |
---|
51 | norm_path = full_path.replace("\\", "/") |
---|
52 | return norm_path.split("/")[-1] |
---|
53 | |
---|
54 | def _parse_frag_mode(self, datfile): |
---|
55 | """Get the fragmentation type by looking at dat parameters |
---|
56 | """ |
---|
57 | # TODO: Handle CID and error for other instruments |
---|
58 | for line in datfile: |
---|
59 | if line.startswith("INSTRUMENT="): |
---|
60 | if line.strip().split('=')[1] == "ETD-TRAP": |
---|
61 | return 'ETD' |
---|
62 | else: |
---|
63 | return 'CID' |
---|
64 | |
---|
65 | def _parse_annotations(self, datfile): |
---|
66 | """Parses the peptide section of mascot |
---|
67 | """ |
---|
68 | nativeid_annotations = defaultdict(list) |
---|
69 | # Seek until Mascot peptide section |
---|
70 | peptide_section = SECTION_TEMPLATE.substitute(section="peptides") |
---|
71 | for line in datfile: |
---|
72 | if line.strip() == peptide_section: |
---|
73 | break |
---|
74 | for line in datfile: |
---|
75 | match = re.match(PEP_REGEX, line) |
---|
76 | if match: |
---|
77 | nativeid = int(match.group(1)) |
---|
78 | peptide_str = match.group(2) |
---|
79 | mods_str = match.group(3) |
---|
80 | accs = match.group(4) |
---|
81 | peptide = apply_mods(peptide_str, mods_str) |
---|
82 | is_decoy = check_decoy(accs, self.decoy_regex) |
---|
83 | # Assuming that after a peptide line, always the next line is |
---|
84 | # the line with flanking Aa |
---|
85 | # q4_p2_terms=E,K:E,K:E,K:E,K:E,K |
---|
86 | line = datfile.next().strip() |
---|
87 | # (('E', 'K'), ('E', 'K'), ('E', 'K')) |
---|
88 | flanking_pairs = set(tuple(x.split(',')) |
---|
89 | for x in line.split("=")[1].split(":")) |
---|
90 | for flanking_pair in flanking_pairs: |
---|
91 | annotation = '.'.join(( |
---|
92 | flanking_pair[0], peptide, flanking_pair[1])) |
---|
93 | annotation = annotation.replace('-', '*') |
---|
94 | nativeid_annotations[nativeid].append(dict( |
---|
95 | Annotation=annotation, IsDecoy=is_decoy)) |
---|
96 | elif line.strip() == SEPARATOR: |
---|
97 | return nativeid_annotations |
---|
98 | |
---|
99 | def _parse_spectra(self, datfile): |
---|
100 | """Gets the data coming from the input file from Mascot dat |
---|
101 | format |
---|
102 | """ |
---|
103 | nativeid_spectra = defaultdict(dict) |
---|
104 | nativeid = 1 |
---|
105 | for line in datfile: |
---|
106 | nativeid_str = ''.join(("query", str(nativeid))) |
---|
107 | query_line = SECTION_TEMPLATE.substitute(section=nativeid_str) |
---|
108 | if line.strip() == query_line: |
---|
109 | current_id_mgf = nativeid_spectra[nativeid] |
---|
110 | nativeid += 1 |
---|
111 | elif line.startswith("title="): |
---|
112 | line = line.strip() |
---|
113 | #TODO: handle fractions |
---|
114 | #fraction = int(re.search(fraction_regex, line).group(1)) |
---|
115 | #current_id_mgf["Fraction"] = fraction |
---|
116 | try: |
---|
117 | rawfn = re.search(RAWFN_REGEX, line).group(1) |
---|
118 | except AttributeError: |
---|
119 | sys.exit("It seems there is no raw: field in TITLE") |
---|
120 | rawfn = rawfn.replace('%2e', '.') |
---|
121 | current_id_mgf["RawFile"] = rawfn |
---|
122 | if self.scans_in_title: |
---|
123 | try: |
---|
124 | scan = int(re.search(self.scan_regex, line).group(1)) |
---|
125 | except AttributeError: |
---|
126 | sys.exit("Default scan regex for TITLE field" |
---|
127 | " doesn't match anything") |
---|
128 | current_id_mgf['Scan'] = scan |
---|
129 | elif line.startswith("charge="): |
---|
130 | charge = line.strip().split('=')[1] |
---|
131 | current_id_mgf['Charge'] = charge |
---|
132 | elif not self.scans_in_title and line.startswith("scans="): |
---|
133 | scan = int(line.strip().split('=')[2]) |
---|
134 | current_id_mgf['Scan'] = scan |
---|
135 | return nativeid_spectra |
---|
136 | |
---|
137 | def parse(self, datfile): |
---|
138 | """Takes a dat file and returns a dictionary of psms. |
---|
139 | """ |
---|
140 | # TODO: handle frag_mode from mgf data |
---|
141 | pkl_fn = self._parse_spectra_fn(datfile) |
---|
142 | frag_mode = self._parse_frag_mode(datfile) |
---|
143 | nativeid_annotations = self._parse_annotations(datfile) |
---|
144 | nativeid_spectra = self._parse_spectra(datfile) |
---|
145 | for native_id, annotations in nativeid_annotations.items(): |
---|
146 | for annotation in annotations: |
---|
147 | annotation['#SpectraFile'] = pkl_fn |
---|
148 | annotation['FragMode'] = frag_mode |
---|
149 | annotation.update(nativeid_spectra[native_id]) |
---|
150 | yield annotation |
---|
151 | |
---|
152 | def apply_mods(peptide_str, mod_str): |
---|
153 | """Takes a peptide string and applies mascot modification string. It |
---|
154 | only supports C+57. |
---|
155 | """ |
---|
156 | mod_str = mod_str[1:-1] |
---|
157 | return ''.join((match != '0' and (p + '+57') or p for |
---|
158 | p, match in zip(peptide_str, mod_str))) |
---|
159 | |
---|
160 | def check_decoy(accs, decoy_regex): |
---|
161 | """Returns True if there is at least one target protein in Mascot |
---|
162 | protein accession string. |
---|
163 | """ |
---|
164 | prots = (y[1] for y in (x.split('"') for x in accs.split(','))) |
---|
165 | for prot in prots: |
---|
166 | match = re.search(decoy_regex, prot) |
---|
167 | if not match: |
---|
168 | return False |
---|
169 | return True |
---|
170 | |
---|
171 | def write_csv(psms, out_file): |
---|
172 | """Takes a dictionary of PSMs and writes a modified InsPecT output. |
---|
173 | """ |
---|
174 | fieldnames = ("#SpectraFile", |
---|
175 | "RawFile", |
---|
176 | "Fraction", |
---|
177 | "Scan", |
---|
178 | "Annotation", |
---|
179 | "Charge", |
---|
180 | "FragMode", |
---|
181 | "IsDecoy") |
---|
182 | header = dict() |
---|
183 | for fieldname in fieldnames: |
---|
184 | header[fieldname] = fieldname |
---|
185 | writer = csv.DictWriter(out_file, fieldnames, dialect='excel-tab') |
---|
186 | for psm in psms: |
---|
187 | writer.writerow(psm) |
---|
188 | |
---|
189 | def parse_dats(dats_dir, proteins_factory=dict, peptides_factory=dict, |
---|
190 | spectra_factory=dict, psms_factory=dict): |
---|
191 | """Parses all the dat files in dats_dir |
---|
192 | """ |
---|
193 | dat_paths = (f for f in os.walk(dats_dir)) |
---|
194 | proteins = proteins_factory() |
---|
195 | peptides = peptides_factory() |
---|
196 | spectra = spectra_factory() |
---|
197 | psms = psms_factory() |
---|
198 | dat_parser = DatParser(decoy_str, scans_in_title) |
---|
199 | for dat_path in dat_paths: |
---|
200 | with open(dat_path) as dat_file: |
---|
201 | dat_proteins, dat_peptides, \ |
---|
202 | dat_spectra, dat_psms = dat_parser.parse(dat_file) |
---|
203 | proteins.update(dat_proteins) |
---|
204 | peptides.update(dat_peptides) |
---|
205 | spectra.update(dat_spectra) |
---|
206 | psms.update(dat_psms) |
---|
207 | return proteins, peptides, spectra, psms |
---|
208 | |
---|
209 | def main(): |
---|
210 | usage = "usage: %prog [options] mascotresult1.dat [mascotresult2.dat ...]" |
---|
211 | optparser = OptionParser(usage) |
---|
212 | optparser.add_option("-o", |
---|
213 | "--output", |
---|
214 | dest="out_fn", |
---|
215 | help="Output file [default: stdout]", |
---|
216 | metavar="FILE") |
---|
217 | optparser.add_option("-s", |
---|
218 | "--scans-in-title", |
---|
219 | dest="scans_in_title", |
---|
220 | type="int", |
---|
221 | default="1", |
---|
222 | help="1 to parse the scan number from the TITLE field, " |
---|
223 | "0 to parse the scan number from the SCAN field " |
---|
224 | "[default: %default]") |
---|
225 | optparser.add_option("-d", |
---|
226 | "--decoy-str", |
---|
227 | dest="decoy_str", |
---|
228 | default="^IPI:REV_:IPI", |
---|
229 | help="1 to parse the scan number from the TITLE field, " |
---|
230 | "0 to parse the scan number from the SCAN field " |
---|
231 | "[default: %default]") |
---|
232 | (options, args) = optparser.parse_args() |
---|
233 | |
---|
234 | if len(args) == 0: |
---|
235 | optparser.print_usage() |
---|
236 | |
---|
237 | else: |
---|
238 | for arg in args: |
---|
239 | if not arg.endswith(".dat"): |
---|
240 | optparser.error("Provide only mascot dat files") |
---|
241 | else: |
---|
242 | if options.out_fn: |
---|
243 | out_fn = options.out_file |
---|
244 | out_file = open(out_fn, 'wb') |
---|
245 | else: |
---|
246 | out_file = sys.stdout |
---|
247 | for arg in args: |
---|
248 | with open(arg) as in_file: |
---|
249 | dat_parser = DatParser(options.decoy_str, |
---|
250 | scans_in_title = options.scans_in_title) |
---|
251 | psms = dat_parser.parse(in_file) |
---|
252 | write_csv(psms, out_file) |
---|
253 | |
---|
254 | if __name__ == '__main__': |
---|
255 | main() |
---|