1 | """Models for mzcms. |
---|
2 | |
---|
3 | """ |
---|
4 | import time |
---|
5 | from datetime import datetime |
---|
6 | import textwrap |
---|
7 | from itertools import combinations |
---|
8 | |
---|
9 | from persistent import Persistent |
---|
10 | from repoze.folder import Folder |
---|
11 | from repoze.catalog.catalog import Catalog |
---|
12 | from repoze.catalog.document import DocumentMap |
---|
13 | from repoze.catalog.indexes.field import CatalogFieldIndex |
---|
14 | from repoze.catalog.indexes.path2 import CatalogPathIndex2 |
---|
15 | from repoze.bfg.traversal import model_path |
---|
16 | |
---|
17 | from parsers import parse_dats |
---|
18 | from msquant import parse_msq_files |
---|
19 | |
---|
20 | |
---|
21 | # XXX: move constant masses to another module |
---|
22 | PROTON = 1.00727646677 |
---|
23 | |
---|
24 | class Experiment(Folder): |
---|
25 | """A proteomics experiment""" |
---|
26 | __parent__ = __name__ = None |
---|
27 | def __init__(self): |
---|
28 | self.catalog = Catalog() |
---|
29 | self.catalog['max_score'] = CatalogFieldIndex(get_max_score) |
---|
30 | self.catalog['path'] = CatalogPathIndex2(get_path) |
---|
31 | self.catalog.document_map = DocumentMap() |
---|
32 | super(Experiment, self).__init__() |
---|
33 | |
---|
34 | def update_indexes(self): |
---|
35 | catalog = self.catalog |
---|
36 | |
---|
37 | for data in self.values(): |
---|
38 | docid = catalog.document_map.add(model_path(data)) |
---|
39 | catalog.index_doc(docid, data) |
---|
40 | |
---|
41 | for prot in self['proteins'].values(): |
---|
42 | docid = catalog.document_map.add(model_path(prot)) |
---|
43 | catalog.index_doc(docid, prot) |
---|
44 | |
---|
45 | for pep in self['peptides'].values(): |
---|
46 | docid = catalog.document_map.add(model_path(pep)) |
---|
47 | catalog.index_doc(docid, pep) |
---|
48 | |
---|
49 | # Test of catalog for peptides. This will be moved somewhere else. |
---|
50 | def get_max_score(object, default): |
---|
51 | return getattr(object, 'max_score', float()) |
---|
52 | |
---|
53 | def get_path(object, default): |
---|
54 | return model_path(object) |
---|
55 | |
---|
56 | class Proteins(Folder): |
---|
57 | """A protein container""" |
---|
58 | def update(self, protein_container): |
---|
59 | """Updates the protein container so that if the protein is |
---|
60 | already in the container only the peptide references are |
---|
61 | added. |
---|
62 | """ |
---|
63 | for prot_id, prot in protein_container.items(): |
---|
64 | if prot_id in self: |
---|
65 | # XXX: Test this is done correctly |
---|
66 | self[prot_id].pep_refs.update(prot.pep_refs) |
---|
67 | else: |
---|
68 | self[prot_id] = prot |
---|
69 | |
---|
70 | class Protein(Persistent): |
---|
71 | """A Protein""" |
---|
72 | def __init__(self, native_id, sequence, pep_refs, |
---|
73 | ext_ids=None, description=None, ratios=None, |
---|
74 | max_score=None, coverage=None, covered_seq=None): |
---|
75 | self.native_id = str(native_id) |
---|
76 | self.sequence = str(sequence) |
---|
77 | self.ext_ids = ext_ids |
---|
78 | self.description = description |
---|
79 | self.pep_refs = pep_refs |
---|
80 | self.ratios = ratios |
---|
81 | self.coverage = coverage |
---|
82 | self.covered_seq = None |
---|
83 | |
---|
84 | def update_coverage(self): |
---|
85 | cov_seq = self.sequence[:] |
---|
86 | for pep in self.pep_refs: |
---|
87 | new_seq = list() |
---|
88 | seq = self.sequence.replace(pep.naked_seq, |
---|
89 | 'x' * len(pep.naked_seq)) |
---|
90 | for s, c in zip(seq, cov_seq): |
---|
91 | if s == 'x': |
---|
92 | new_seq.append('x') |
---|
93 | else: |
---|
94 | new_seq.append(c) |
---|
95 | cov_seq = ''.join(new_seq) |
---|
96 | self.covered_seq = cov_seq |
---|
97 | |
---|
98 | @property |
---|
99 | def coverage_seq(self): |
---|
100 | coverage = float(self.covered_seq.count('x'))/len(self.sequence) |
---|
101 | return round(coverage * 100, 2) |
---|
102 | |
---|
103 | def update_pep_num(self): |
---|
104 | self.pep_num = len(self.pep_refs) |
---|
105 | |
---|
106 | def update_psm_num(self): |
---|
107 | psm_count = int() |
---|
108 | for pep in self.pep_refs: |
---|
109 | psm_count += pep.psm_num |
---|
110 | self.psm_num = psm_count |
---|
111 | |
---|
112 | def formatted_seq(self): |
---|
113 | # TODO: Make sure this doesn't screw up in windows |
---|
114 | # Maybe this is better in the view |
---|
115 | return '\n'.join(textwrap.wrap(self.sequence)) |
---|
116 | |
---|
117 | def update_ratios(self): |
---|
118 | ratios = list() |
---|
119 | for pep in self.pep_refs: |
---|
120 | for psm in pep.psm_refs: |
---|
121 | if psm.ratios: |
---|
122 | ratios.append(psm.ratios) |
---|
123 | if len(ratios) > 0: |
---|
124 | il_ratio = sum(r[0] for r in ratios)/len(ratios) |
---|
125 | lh_ratio = sum(r[1] for r in ratios)/len(ratios) |
---|
126 | hi_ratio = sum(r[2] for r in ratios)/len(ratios) |
---|
127 | self.ratios = il_ratio, lh_ratio, hi_ratio |
---|
128 | |
---|
129 | def update_max_score(self): |
---|
130 | """Checks what is the maximum score in all peptide references. |
---|
131 | """ |
---|
132 | max_score = float() |
---|
133 | for pep in self.pep_refs: |
---|
134 | if pep.max_score > max_score: |
---|
135 | max_score = pep.max_score |
---|
136 | self.max_score = max_score |
---|
137 | |
---|
138 | class Peptide(Persistent): |
---|
139 | """A peptide""" |
---|
140 | def __init__(self, sequence, mass, psm_refs=None, max_score=None): |
---|
141 | self.sequence = tuple(sequence) |
---|
142 | self.naked_seq = ''.join((x[0] for x in sequence[1:-1])) |
---|
143 | self.mass = float(mass) |
---|
144 | self.length = len(sequence) |
---|
145 | self.psm_refs = psm_refs |
---|
146 | self.max_score = max_score |
---|
147 | |
---|
148 | def update_psm_num(self): |
---|
149 | self.psm_num = len(self.psm_refs) |
---|
150 | |
---|
151 | def update_max_score(self): |
---|
152 | self.max_score = max((p.score for p in self.psm_refs)) |
---|
153 | |
---|
154 | def formatted_seq(self): |
---|
155 | seq = str() |
---|
156 | for aa, m in self.sequence: |
---|
157 | if m: |
---|
158 | f_aa = ''.join((m[1], aa)) |
---|
159 | else: |
---|
160 | f_aa = aa |
---|
161 | seq = ''.join((seq, f_aa)) |
---|
162 | return seq |
---|
163 | |
---|
164 | def was_quanted(self): |
---|
165 | for psm in self.psm_refs: |
---|
166 | if len(psm.quant_feat) >= 1: |
---|
167 | return True |
---|
168 | return False |
---|
169 | |
---|
170 | def formatted_ptms(self): |
---|
171 | return ', '.join((x[1][1] for x in self.sequence |
---|
172 | if x[1] is not None)) |
---|
173 | |
---|
174 | class Spectrum(Persistent): |
---|
175 | """A Spectrum""" |
---|
176 | def __init__(self, scan, run, peaks, prec_mz, prec_charge, |
---|
177 | psm_refs=None): |
---|
178 | self.scan = int(scan) |
---|
179 | self.run = str(run) |
---|
180 | self.peaks = tuple(peaks) |
---|
181 | self.prec_mz = float(prec_mz) |
---|
182 | self.prec_charge = int(prec_charge) |
---|
183 | self.psm_refs = psm_refs or set() |
---|
184 | |
---|
185 | class Psm(Persistent): |
---|
186 | """A Peptide to Spectrum Match""" |
---|
187 | def __init__(self, score, rank, delta_mass, pep_ref, spec_ref, |
---|
188 | params_ref=None, mme=None, quant_feat=None, xics=None, |
---|
189 | ratios=None): |
---|
190 | self.score = float(score) |
---|
191 | self.rank = int(rank) |
---|
192 | self.delta_mass = float(delta_mass) |
---|
193 | # mass measurement error |
---|
194 | self.mme = mme |
---|
195 | #self.delta_score = float(delta_score) |
---|
196 | #self.sequence_rank2 = str(sequence_rank2) |
---|
197 | self.pep_ref = pep_ref |
---|
198 | self.spec_ref = spec_ref |
---|
199 | self.params_ref = params_ref |
---|
200 | self.quant_feat = quant_feat or list() |
---|
201 | self.xics = xics |
---|
202 | self.ratios = ratios |
---|
203 | |
---|
204 | def update_mme(self): |
---|
205 | charge= self.spec_ref.prec_charge |
---|
206 | theo_mass = self.pep_ref.mass |
---|
207 | theo_mz = (theo_mass + charge * PROTON) / charge |
---|
208 | exp_mz = self.spec_ref.prec_mz |
---|
209 | ratio = exp_mz/theo_mz |
---|
210 | if ratio > 1.0: |
---|
211 | mme = 10**6 * (ratio - 1.0) |
---|
212 | elif ratio < 1.0: |
---|
213 | mme = -10**6 * (1.0 - ratio) |
---|
214 | elif ratio == 1.0: |
---|
215 | mme = 0 |
---|
216 | self.mme = mme |
---|
217 | |
---|
218 | def was_quanted(self): |
---|
219 | if len(self.quant_feat) >= 1: |
---|
220 | return True |
---|
221 | return False |
---|
222 | |
---|
223 | def update_ratios(self): |
---|
224 | ratios = list() |
---|
225 | if self.xics: |
---|
226 | for comb in combinations(self.xics, 2): |
---|
227 | try: |
---|
228 | # IL, HL, HI |
---|
229 | ratios.append(comb[1]/comb[0]) |
---|
230 | except ZeroDivisionError: |
---|
231 | ratios.append(float()) |
---|
232 | self.ratios = tuple(ratios) |
---|
233 | |
---|
234 | class Ptm(Persistent): |
---|
235 | """A Post-tranlational modification""" |
---|
236 | def __init__(self, mass, name): |
---|
237 | self.name = name |
---|
238 | self.mass = float(mass) |
---|
239 | |
---|
240 | class SerParameters(Persistent): |
---|
241 | """A set of Search Engine Results parameters""" |
---|
242 | def __init__(self, |
---|
243 | frag_mode='CID', |
---|
244 | db_name=None, |
---|
245 | pkl_fn=None, |
---|
246 | enzyme='Trypsin', |
---|
247 | prec_tol=(0.6, 'Da'), |
---|
248 | frag_tol=(50.0, 'ppm'), |
---|
249 | quantitation=None, |
---|
250 | program='Mascot', |
---|
251 | version='2.2.04', |
---|
252 | date=time.time(), |
---|
253 | ): |
---|
254 | self.frag_mode=frag_mode |
---|
255 | self.db_name = db_name |
---|
256 | self.pkl_fn = pkl_fn |
---|
257 | self.enzyme = enzyme |
---|
258 | self.prec_tol = prec_tol |
---|
259 | self.frag_tol = frag_tol |
---|
260 | self.quantitation = quantitation |
---|
261 | self.program = program |
---|
262 | self.version = version |
---|
263 | self.date = datetime.fromtimestamp(date) |
---|
264 | |
---|
265 | def update_psms(psms): |
---|
266 | for psm in psms.values(): |
---|
267 | psm.update_mme() |
---|
268 | psm.update_ratios() |
---|
269 | |
---|
270 | def update_peptides(peptides): |
---|
271 | """Updates the number of psms in all peptides""" |
---|
272 | for peptide in peptides.values(): |
---|
273 | peptide.update_psm_num() |
---|
274 | peptide.update_max_score() |
---|
275 | |
---|
276 | |
---|
277 | #TODO: Move this to another module |
---|
278 | def update_proteins(proteins): |
---|
279 | """Updates the number of psms and peptides in all proteins. |
---|
280 | """ |
---|
281 | for protein in proteins.values(): |
---|
282 | protein.update_pep_num() |
---|
283 | protein.update_psm_num() |
---|
284 | protein.update_coverage() |
---|
285 | protein.update_ratios() |
---|
286 | protein.update_max_score() |
---|
287 | |
---|
288 | def appmaker(zodb_root): |
---|
289 | if not 'app_root' in zodb_root: |
---|
290 | p_t = time.time() |
---|
291 | app_root = Experiment() |
---|
292 | proteins = Proteins() |
---|
293 | peptides = Folder() |
---|
294 | spectra = Folder() |
---|
295 | psms = Folder() |
---|
296 | zodb_root['app_root'] = app_root |
---|
297 | app_root['proteins'] = proteins |
---|
298 | app_root['peptides'] = peptides |
---|
299 | app_root['spectra'] = spectra |
---|
300 | app_root['psms'] = psms |
---|
301 | |
---|
302 | parse_dats('./min-dats', |
---|
303 | proteins_container=proteins, |
---|
304 | peptides_container=peptides, |
---|
305 | spectra_container=spectra, |
---|
306 | psms_container=psms, |
---|
307 | protein_factory=Protein, |
---|
308 | peptide_factory=Peptide, |
---|
309 | spectrum_factory=Spectrum, |
---|
310 | psm_factory=Psm, |
---|
311 | serparam_factory=SerParameters, |
---|
312 | ) |
---|
313 | import transaction |
---|
314 | transaction.commit() |
---|
315 | m_t = time.time() |
---|
316 | print(u"Time to parse dat files {0} secs.".format((m_t - p_t))) |
---|
317 | p_t = m_t |
---|
318 | |
---|
319 | parse_msq_files(u"./min-dats", psms_container=psms, |
---|
320 | spectra_container=spectra) |
---|
321 | transaction.commit() |
---|
322 | m_t = time.time() |
---|
323 | print(u"Time to parse MsQuant output: {0} secs.".format((m_t - p_t))) |
---|
324 | p_t = m_t |
---|
325 | |
---|
326 | update_psms(psms) |
---|
327 | transaction.commit() |
---|
328 | m_t = time.time() |
---|
329 | print(u"Time to update PSMs: {0} secs.".format((m_t - p_t))) |
---|
330 | p_t = m_t |
---|
331 | |
---|
332 | update_peptides(peptides) |
---|
333 | transaction.commit() |
---|
334 | m_t = time.time() |
---|
335 | print(u"Time to update peptides: {0} secs.".format((m_t - p_t))) |
---|
336 | p_t = m_t |
---|
337 | |
---|
338 | update_proteins(proteins) |
---|
339 | transaction.commit() |
---|
340 | m_t = time.time() |
---|
341 | print(u"Time to update proteins: {0} secs.".format((m_t - p_t))) |
---|
342 | p_t = m_t |
---|
343 | |
---|
344 | app_root.update_indexes() |
---|
345 | transaction.commit() |
---|
346 | m_t = time.time() |
---|
347 | print(u"Time to update indexes: {0} secs.".format((m_t - p_t))) |
---|
348 | p_t = m_t |
---|
349 | |
---|
350 | print(u"Finished building") |
---|
351 | |
---|
352 | return zodb_root['app_root'] |
---|