"""MS/MS search engine results parsers. """ import sys import os import re import csv import urllib from optparse import OptionParser from string import Template from collections import defaultdict SEPARATOR = '--gc0p4Jq0M2Yt08jU534c0p' SECTION_TEMPLATE = Template( r'Content-Type: application/x-Mascot; name="$section"') PEP_REGEX = re.compile(r'^q(\d+)' # nativeid '_p\d+' # rank '=\d+?' # missed cleavages ',[\.\d]+?' # peptide mass ',[-\.\d]+?' # delta mass ',\d+?' # n ions matches ',(\w+?)' # peptide string ',\d+?' # peaks used for Ions1 ',(\d+?)' # modstring ',[\.\d]+?' # score ',\d+?' # ion series found ',\d+?' # peaks used for Ions2 ',\d+?' # peaks used for Ions3 ';(.+)$' # protein accessions string ) #fraction_regex = re.compile(r'fraction%3a%20(\d+)') class DatParser(object): """Mascot Dat Parser """ def __init__(self, scan_str=r'FinneganScanNumber%3a%20(\d+)', rawfn_str=r'RawFile%3a%20(.+raw)', decoy_str=r'^IPI:REV_:IPI', contaminant_str=r'^IPI:CON_:IPI' ): self.scan_regex = re.compile(scan_str) self.decoy_regex = re.compile(decoy_str) self.rawfn_regex = re.compile(rawfn_str) self.contaminant_regex = re.compile(contaminant_str) def _parse_spectra_fn(self, dat_file): """Parses the spectra file name used for Mascot search """ for line in dat_file: if line.startswith("FILE="): full_path = line.strip().split('=')[1] norm_path = full_path.replace('\\', '/') return norm_path.split("/")[-1] def _parse_frag_mode(self, dat_file): """Get the fragmentation type by looking at dat parameters """ # TODO: Handle CID and error for other instruments for line in dat_file: if line.startswith("INSTRUMENT="): frag_line = line.strip().split('=')[1] if frag_line == "ETD-TRAP": return 'ETD' elif frag_line == "ESI-TRAP": return 'CID' else: return frag_line def _parse_psms(self, dat_file): """Parses the peptide section of mascot """ nativeid_psms = defaultdict(list) # Seek until Mascot peptide section peptide_section = SECTION_TEMPLATE.substitute(section="peptides") for line in dat_file: if line.strip() == peptide_section: break for line in dat_file: match = re.match(PEP_REGEX, line) if match: import ipdb; ipdb.set_trace() nativeid = int(match.group(1)) peptide_str = match.group(2) mods_str = match.group(3) accs = match.group(4) peptide = apply_mods(peptide_str, mods_str) is_decoy = check_decoy(accs, self.decoy_regex) # Assuming that after a peptide line, always the next line is # the line with flanking Aa # q4_p2_terms=E,K:E,K:E,K:E,K:E,K line = dat_file.next().strip() # (('E', 'K'), ('E', 'K'), ('E', 'K')) flanking_pairs = set(tuple(x.split(',')) for x in line.split("=")[1].split(":")) for flanking_pair in flanking_pairs: annotation = '.'.join(( flanking_pair[0], peptide, flanking_pair[1])) annotation = annotation.replace('-', '*') nativeid_psms[nativeid].append(dict( Annotation=annotation, IsDecoy=is_decoy)) elif line.strip() == SEPARATOR: return nativeid_psms def _parse_spectra(self, datfile): """Gets the data coming from the input file from Mascot dat format """ nativeid_spectra = defaultdict(dict) nativeid = 1 for line in datfile: nativeid_str = ''.join(("query", str(nativeid))) query_line = SECTION_TEMPLATE.substitute(section=nativeid_str) if line.strip() == query_line: current_id_mgf = nativeid_spectra[nativeid] nativeid += 1 elif line.startswith("title="): line = line.strip() #TODO: handle fractions #fraction = int(re.search(fraction_regex, line).group(1)) #current_id_mgf["Fraction"] = fraction try: quoted_rawfn = re.search(rawfn_regex, line).group(1) except AttributeError: sys.exit("It seems there is no raw: field in TITLE") rawfn = urllib.unquote(quoted_rawfn) current_id_mgf["RawFile"] = rawfn if self.scans_in_title: try: scan = int(re.search(self.scan_regex, line).group(1)) except AttributeError: sys.exit("Default scan regex for TITLE field" " doesn't match anything") current_id_mgf['Scan'] = scan elif line.startswith("charge="): charge = line.strip().split('=')[1] current_id_mgf['Charge'] = charge elif not self.scans_in_title and line.startswith("scans="): scan = int(line.strip().split('=')[2]) current_id_mgf['Scan'] = scan return nativeid_spectra def parse(self, dat_file): """Takes a dat file and returns a dictionary of psms. """ pkl_fn = self._parse_spectra_fn(dat_file) frag_mode = self._parse_frag_mode(dat_file) nativeid_psms = self._parse_psms(dat_file) nativeid_spectra = self._parse_spectra(datfile) for native_id, annotations in nativeid_annotations.items(): for annotation in annotations: annotation['#SpectraFile'] = pkl_fn annotation['FragMode'] = frag_mode annotation.update(nativeid_spectra[native_id]) yield annotation def apply_mods(peptide_str, mod_str): """Takes a peptide string and applies mascot modification string. It only supports C+57. """ mod_str = mod_str[1:-1] return ''.join((match != '0' and (p + '+57') or p for p, match in zip(peptide_str, mod_str))) def check_decoy(accs, decoy_regex): """Returns True if there is at least one target protein in Mascot protein accession string. """ prots = (y[1] for y in (x.split('"') for x in accs.split(','))) for prot in prots: match = re.search(decoy_regex, prot) if not match: return False return True def write_csv(psms, out_file): """Takes a dictionary of PSMs and writes a modified InsPecT output. """ fieldnames = ("#SpectraFile", "RawFile", "Fraction", "Scan", "Annotation", "Charge", "FragMode", "IsDecoy") header = dict() for fieldname in fieldnames: header[fieldname] = fieldname writer = csv.DictWriter(out_file, fieldnames, dialect='excel-tab') for psm in psms: writer.writerow(psm) def parse_dats(dats_dir, proteins_factory=dict, peptides_factory=dict, spectra_factory=dict, psms_factory=dict): """Parses all the dat files in dats_dir """ proteins = proteins_factory() peptides = peptides_factory() spectra = spectra_factory() psms = psms_factory() dat_parser = DatParser() for root, dirs, files in os.walk(dats_dir): for dat_fn in