1 | package nl.tno.massSequencing |
---|
2 | |
---|
3 | import java.io.BufferedWriter; |
---|
4 | import java.io.File; |
---|
5 | import java.io.Writer; |
---|
6 | import java.util.ArrayList; |
---|
7 | import java.util.List; |
---|
8 | import org.codehaus.groovy.grails.commons.ConfigurationHolder |
---|
9 | import java.util.zip.* |
---|
10 | |
---|
11 | class FastaService implements nl.tno.massSequencing.imports.Importer { |
---|
12 | def fileService |
---|
13 | def fuzzySearchService |
---|
14 | def sampleExcelService |
---|
15 | def excelService |
---|
16 | |
---|
17 | static transactional = false |
---|
18 | |
---|
19 | /** |
---|
20 | * Determines whether a file can be processed. |
---|
21 | * @param filetype Filetype of the file |
---|
22 | * @see determineFileType() |
---|
23 | * @return |
---|
24 | */ |
---|
25 | public boolean canParseFileType( String filetype ) { |
---|
26 | switch( filetype ) { |
---|
27 | case "fasta": |
---|
28 | case "qual": |
---|
29 | return true; |
---|
30 | default: |
---|
31 | return false; |
---|
32 | } |
---|
33 | } |
---|
34 | |
---|
35 | /** |
---|
36 | * Parses the given FASTA or QUAL file |
---|
37 | * |
---|
38 | * @param file File to parse |
---|
39 | * @param filetype Type of the given file |
---|
40 | * @param onProgress Closure to execute when progress indicators should be updated. |
---|
41 | * Has 2 parameters: progress that indicates the number of bytes that have been processed. The second parameter determines |
---|
42 | * the number of bytes that has to be processed extra after this action (e.g. when a zip file is extracted, the extracted |
---|
43 | * files should be added to the total number of bytes to be processed) |
---|
44 | * @return Structure with information about the parsed files. The 'success' files are |
---|
45 | * moved to the given directory |
---|
46 | * |
---|
47 | * Examples: |
---|
48 | * [filename: 'abc.fasta', type: FASTA, numSequences: 190] |
---|
49 | * [filename: 'cde.qual', type: QUAL, numSequences: 140, avgQuality: 29] |
---|
50 | * |
---|
51 | * [filename: 'test.zip', type: ZIP, extraFiles: [newfile1.xls, newfile2.xls, newfile3.xls]] |
---|
52 | * |
---|
53 | * [filename: 'testing.xls', type: 'unknown', message: 'Type not recognized'] |
---|
54 | * |
---|
55 | */ |
---|
56 | |
---|
57 | public Map parseFile( File file, String filetype, Closure onProgress ) { |
---|
58 | switch( filetype ) { |
---|
59 | case "fasta": |
---|
60 | return parseFasta( file, onProgress ); |
---|
61 | case "qual": |
---|
62 | return parseQual( file, onProgress ); |
---|
63 | default: |
---|
64 | return [ success: false, type: filetype, message: 'Filetype could not be parsed.' ] |
---|
65 | } |
---|
66 | } |
---|
67 | |
---|
68 | /** |
---|
69 | * Parses the given FASTA file |
---|
70 | * @param file File to parse |
---|
71 | * @param onProgress Closure to execute when progress indicators should be updated. |
---|
72 | * Has 2 parameters: numFilesProcessed and numBytesProcessed that indicate the number |
---|
73 | * of files and bytes that have been processed in this file (so the first parameter should |
---|
74 | * only be 1 when the file is finished) |
---|
75 | * @return List structure. Examples: |
---|
76 | * |
---|
77 | * [ success: true, filename: 'abc.fasta', type: 'fasta', numSequences: 200 ] |
---|
78 | * [ success: false, filename: 'def.fasta', type: 'fasta', message: 'File is not a valid FASTA file' ] |
---|
79 | */ |
---|
80 | protected def parseFasta( File file, Closure onProgress ) { |
---|
81 | |
---|
82 | long startTime = System.nanoTime(); |
---|
83 | log.trace "Start parsing FASTA " + file.getName() |
---|
84 | |
---|
85 | // Count the number of lines, starting with '>' (and where the following line contains a character other than '>') |
---|
86 | long numSequences = 0; |
---|
87 | long bytesProcessed = 0; |
---|
88 | boolean lookingForSequence = false; |
---|
89 | |
---|
90 | file.eachLine { line -> |
---|
91 | if( line ) { |
---|
92 | if( !lookingForSequence && line[0] == '>' ) { |
---|
93 | lookingForSequence = true; |
---|
94 | } else if( lookingForSequence ) { |
---|
95 | if( line[0] != '>' ) { |
---|
96 | numSequences++; |
---|
97 | lookingForSequence = false; |
---|
98 | } |
---|
99 | } |
---|
100 | |
---|
101 | |
---|
102 | // Update progress every time 1MB is processed |
---|
103 | bytesProcessed += line.size(); |
---|
104 | if( bytesProcessed > 1000000 ) { |
---|
105 | onProgress( bytesProcessed, 0 ); |
---|
106 | bytesProcessed = 0; |
---|
107 | } |
---|
108 | } |
---|
109 | |
---|
110 | } |
---|
111 | |
---|
112 | // Update progress and say we're finished |
---|
113 | onProgress( bytesProcessed, 0 ); |
---|
114 | |
---|
115 | log.trace "Finished parsing FASTA " + file.getName() + ": " + ( System.nanoTime() - startTime ) / 1000000L |
---|
116 | |
---|
117 | return [ success: true, type: "fasta", filename: file.getName(), numSequences: numSequences ]; |
---|
118 | } |
---|
119 | |
---|
120 | /** |
---|
121 | * Parses the given FASTA file and returns a list with sequence lengths |
---|
122 | * @param file File to parse |
---|
123 | * @return List [ 400: 12, 410: 15, 'unknown': 14 ] |
---|
124 | */ |
---|
125 | protected def parseFastaLengths( File file ) { |
---|
126 | |
---|
127 | long startTime = System.nanoTime(); |
---|
128 | log.trace "Start parsing FASTA " + file.getName() |
---|
129 | |
---|
130 | // Count the number of lines, starting with '>' (and where the following line contains a character other than '>') |
---|
131 | def histogram = [:] |
---|
132 | def lengthPattern = ~/length=(\d+)/ |
---|
133 | def length |
---|
134 | def lengthMatches |
---|
135 | try { |
---|
136 | file.eachLine { line -> |
---|
137 | if( line ) { |
---|
138 | if( line[0] == '>' ) { |
---|
139 | // Comments line: find length=### |
---|
140 | lengthMatches = ( line =~ lengthPattern ); |
---|
141 | if( lengthMatches ) { |
---|
142 | length = Integer.valueOf( lengthMatches[0][1] ); |
---|
143 | } else { |
---|
144 | length = 'unknown' |
---|
145 | } |
---|
146 | |
---|
147 | histogram[ length ] = ( histogram[length] ?: 0 ) + 1; |
---|
148 | } |
---|
149 | } |
---|
150 | } |
---|
151 | } catch( Exception e ) { |
---|
152 | log.error( "Exception while reading fasta file " + file + ": " + e.getMessage() ) |
---|
153 | } |
---|
154 | |
---|
155 | log.trace "Finished parsing FASTA " + file.getName() + ": " + ( System.nanoTime() - startTime ) / 1000000L |
---|
156 | |
---|
157 | return histogram; |
---|
158 | } |
---|
159 | |
---|
160 | /** |
---|
161 | * Parses the given QUAL file |
---|
162 | * @param file File to parse |
---|
163 | * @param onProgress Closure to execute when progress indicators should be updated. |
---|
164 | * Has 2 parameters: numFilesProcessed and numBytesProcessed that indicate the number |
---|
165 | * of files and bytes that have been processed in this file (so the first parameter should |
---|
166 | * only be 1 when the file is finished) |
---|
167 | * @return List structure. Examples: |
---|
168 | * |
---|
169 | * [ success: true, filename: 'abc.qual', type: 'qual', numSequences: 200, avgQuality: 31 ] |
---|
170 | * [ success: false, filename: 'def.qual', type: 'qual', message: 'File is not a valid QUAL file' ] |
---|
171 | */ |
---|
172 | protected def parseQual( File file, Closure onProgress ) { |
---|
173 | long startTime = System.nanoTime(); |
---|
174 | log.trace "Start parsing QUAL " + file.getName() |
---|
175 | |
---|
176 | // Count the number of lines, starting with '>'. After we've found such a character, we continue looking for |
---|
177 | // quality scores |
---|
178 | long numSequences = 0; |
---|
179 | long bytesProcessed = 0; |
---|
180 | def quality = [ quality: 0.0, number: 0L ] |
---|
181 | |
---|
182 | boolean lookingForFirstQualityScores = false; |
---|
183 | file.eachLine { line -> |
---|
184 | if( line ) { |
---|
185 | if( !lookingForFirstQualityScores && line[0] == '>' ) { |
---|
186 | lookingForFirstQualityScores = true; |
---|
187 | } else if( lookingForFirstQualityScores ) { |
---|
188 | if( line[0] != '>' ) { |
---|
189 | numSequences++; |
---|
190 | lookingForFirstQualityScores = false; |
---|
191 | |
---|
192 | // Don't compute average quality because it takes too much time |
---|
193 | //quality = updateQuality( quality, line ); |
---|
194 | } |
---|
195 | } else { |
---|
196 | // Don't compute average quality because it takes too much time |
---|
197 | //quality = updateQuality( quality, line ); |
---|
198 | } |
---|
199 | |
---|
200 | // Update progress every time 1MB is processed |
---|
201 | bytesProcessed += line.size(); |
---|
202 | if( bytesProcessed > 1000000 ) { |
---|
203 | onProgress( bytesProcessed, 0 ); |
---|
204 | bytesProcessed = 0; |
---|
205 | } |
---|
206 | } |
---|
207 | |
---|
208 | } |
---|
209 | |
---|
210 | // Update progress and say we're finished |
---|
211 | onProgress( bytesProcessed, 0 ); |
---|
212 | |
---|
213 | log.trace "Finished parsing QUAL " + file.getName() + ": " + ( System.nanoTime() - startTime ) / 1000000L |
---|
214 | |
---|
215 | return [ success: true, type: "qual", filename: file.getName(), numSequences: numSequences, avgQuality: quality.quality ]; |
---|
216 | } |
---|
217 | |
---|
218 | |
---|
219 | /** |
---|
220 | * Matches uploaded fasta and qual files and combines them with the samples they probably belong to |
---|
221 | * @param parsedFiles Parsed files |
---|
222 | * [ |
---|
223 | * [filename: 'abc.fasta', type: FASTA, numSequences: 190] |
---|
224 | * [filename: 'cde.fasta', type: FASTA, numSequences: 140] |
---|
225 | * [filename: 'abc.qual', type: QUAL, numSequences: 190, avgQuality: 38] |
---|
226 | * [filename: 'cde.qual', type: QUAL, numSequences: 140, avgQuality: 29] |
---|
227 | * [filename: 'match.xls', type: EXCEL, matches: [ [ filename: 'abc.fasta', basename: 'abc', sample: 's1' ] ] |
---|
228 | * ] |
---|
229 | * @param samples AssaySample objects to which the files should be matched. |
---|
230 | * @return Structure with information about the matching. |
---|
231 | * [ |
---|
232 | * [ |
---|
233 | * fasta: [filename: 'abc.fasta', type: FASTA, numSequences: 190], |
---|
234 | * qual: [filename: 'abc.qual', type: QUAL, numSequences: 190, avgQuality: 38], |
---|
235 | * feasibleQuals: [ |
---|
236 | * [filename: 'abc.qual', type: QUAL, numSequences: 190, avgQuality: 38], |
---|
237 | * [filename: 'def.qual', type: QUAL, numSequences: 190, avgQuality: 21] |
---|
238 | * ] |
---|
239 | * sample: AssaySample object |
---|
240 | */ |
---|
241 | def matchFiles( def parsedFiles, def samples ) { |
---|
242 | def fastas = parsedFiles.findAll { it.type == "fasta" } |
---|
243 | def quals = parsedFiles.findAll { it.type == "qual" } |
---|
244 | def excels = parsedFiles.findAll { it.type == "excel" } |
---|
245 | |
---|
246 | samples = samples.toList() |
---|
247 | |
---|
248 | // Collect (filename) matches from all excel files |
---|
249 | def matches = []; |
---|
250 | excels.each { m -> |
---|
251 | if( m.matches && m.matches.filenames ) { |
---|
252 | // [ filename: filename, samplename: GSCFSample, basename: basename ] |
---|
253 | m.matches.filenames.each { matches << it } |
---|
254 | } |
---|
255 | } |
---|
256 | |
---|
257 | def files = []; |
---|
258 | |
---|
259 | fastas.each { fastaFile -> |
---|
260 | // Remove extension |
---|
261 | def matchWith = fastaFile.originalfilename.substring( 0, fastaFile.originalfilename.lastIndexOf( '.' ) ) |
---|
262 | |
---|
263 | // Determine feasible quals (based on number of sequences ) |
---|
264 | def feasibleQuals = quals.findAll { it.numSequences == fastaFile.numSequences } |
---|
265 | |
---|
266 | // Best matching qual file |
---|
267 | def qualIdx = fuzzySearchService.mostSimilarWithIndex( matchWith + '.qual', feasibleQuals.originalfilename ); |
---|
268 | |
---|
269 | def qual = null |
---|
270 | if( qualIdx != null ) |
---|
271 | qual = feasibleQuals[ qualIdx ]; |
---|
272 | |
---|
273 | // Best matching sample |
---|
274 | def assaySample = null |
---|
275 | def checked = true; |
---|
276 | if( matches ) { |
---|
277 | // Match with files from excelsheet |
---|
278 | |
---|
279 | // First find the best matching filename in the list of matches. |
---|
280 | def sampleNameIdx = fuzzySearchService.mostSimilarWithIndex( matchWith, matches*.basename, 0.8 ); |
---|
281 | |
---|
282 | // If one is found, use the sample name associated with it to do the matching with samples |
---|
283 | if( sampleNameIdx != null ) { |
---|
284 | matchWith = matches[ sampleNameIdx ].samplename; |
---|
285 | } else { |
---|
286 | // If no match is found in the excel sheet, this sample is unchecked by default |
---|
287 | checked = false; |
---|
288 | } |
---|
289 | } |
---|
290 | |
---|
291 | // Match on filenames |
---|
292 | def sampleIdx = fuzzySearchService.mostSimilarWithIndex( matchWith, samples.sample.name ); |
---|
293 | if( sampleIdx != null ) { |
---|
294 | assaySample = samples[ sampleIdx ]; |
---|
295 | } |
---|
296 | |
---|
297 | files << [ |
---|
298 | fasta: fastaFile, |
---|
299 | feasibleQuals: feasibleQuals, |
---|
300 | qual: qual, |
---|
301 | assaySample: assaySample, |
---|
302 | checked: checked |
---|
303 | ] |
---|
304 | } |
---|
305 | |
---|
306 | return files; |
---|
307 | |
---|
308 | |
---|
309 | } |
---|
310 | |
---|
311 | |
---|
312 | /** |
---|
313 | * Parses a given excel file with a match between filenames and samples |
---|
314 | * @param file File to parse |
---|
315 | * @param onProgress Closure to execute when progress indicators should be updated. |
---|
316 | * Has 2 parameters: numFilesProcessed and numBytesProcessed that indicate the number |
---|
317 | * of files and bytes that have been processed in this file (so the first parameter should |
---|
318 | * only be 1 when the file is finished) |
---|
319 | * @return List structure. The matches array contains an array of matches between filenames and sample(name)s. |
---|
320 | * The extension for all files are removed in the 'basename' parameter, in order to improve matching. |
---|
321 | * Examples: |
---|
322 | * |
---|
323 | * [ success: true, filename: 'abc.xls', type: 'excel', matches: [ [ filename: 's1.qual', basename: 's1', sample: 'sample a' ], [ filename: 's9.fna', basename: 's9', sample: 'sample b' ] ] |
---|
324 | * [ success: false, filename: 'def.xls', type: 'excel', message: 'File is not a valid XLS file' ] |
---|
325 | */ |
---|
326 | public def inferExcelMatches( parsedFiles ) { |
---|
327 | // Loop through all excel files, and determine which (usable) columns are in the file |
---|
328 | for( parsedFile in parsedFiles ) { |
---|
329 | if( parsedFile.type == "excel" ) { |
---|
330 | |
---|
331 | def header = parsedFile.header; |
---|
332 | def data = parsedFile.data |
---|
333 | |
---|
334 | // Check whether (at least) 2 columns are present |
---|
335 | if( header.size() < 2 ) { |
---|
336 | parsedFile.success = false |
---|
337 | parsedFile.exceldata = [] |
---|
338 | parsedFile.message = "Excel file must contain at least 2 columns, one with filenames and one with sample names" |
---|
339 | continue; |
---|
340 | } |
---|
341 | |
---|
342 | // Check the headers to see whether any columns are found |
---|
343 | def columns = [:] |
---|
344 | def matches = [ "filenames": [], "mothursamples": [] ] |
---|
345 | |
---|
346 | header.eachWithIndex { cell, i -> |
---|
347 | switch( cell?.toLowerCase() ) { |
---|
348 | case "gscfsample": |
---|
349 | case "sample": |
---|
350 | columns[ "GSCFSample" ] = i; break; |
---|
351 | case "file": |
---|
352 | case "filename": |
---|
353 | columns[ "filename" ] = i; break; |
---|
354 | case "mothursample": |
---|
355 | columns[ "MothurSample" ] = i; break; |
---|
356 | } |
---|
357 | } |
---|
358 | |
---|
359 | // Loop through the data and find matches |
---|
360 | data.each { row -> |
---|
361 | def filename = columns[ "filename" ] != null ? row[ columns[ "filename" ] ] : null; |
---|
362 | def GSCFSample = columns[ "GSCFSample" ] != null ? row[ columns[ "GSCFSample" ] ] : null; |
---|
363 | def mothurSample = columns[ "MothurSample" ] != null ? row[ columns[ "MothurSample" ] ] : null; |
---|
364 | |
---|
365 | // Matches only exist if GSCF sample is entered in this row |
---|
366 | if( GSCFSample && GSCFSample != "null" ) { |
---|
367 | if( filename && filename != "null" ) { |
---|
368 | // Remove extension from the filename, but only if it is |
---|
369 | // .fna, .fasta, .qual, .fqa, .xls or .xlsx. Otherwise useful parts of the filename would be removed. |
---|
370 | def basename = ( filename =~ /\.(fna|fasta|qual|fqa|xls|xlsx)$/ ).replaceAll( "" ); |
---|
371 | |
---|
372 | matches[ "filenames" ] << [ filename: filename, samplename: GSCFSample, basename: basename ]; |
---|
373 | } |
---|
374 | |
---|
375 | if( mothurSample && mothurSample != "null" ) { |
---|
376 | matches[ "mothursamples" ] << [ samplename: GSCFSample, mothurname: mothurSample ]; |
---|
377 | } |
---|
378 | } |
---|
379 | } |
---|
380 | |
---|
381 | parsedFile[ 'columns' ] = columns; |
---|
382 | parsedFile[ 'matches' ] = matches; |
---|
383 | } |
---|
384 | } |
---|
385 | |
---|
386 | return parsedFiles |
---|
387 | } |
---|
388 | |
---|
389 | /** |
---|
390 | * Parses the given line and updates the average quality based on the scores |
---|
391 | * @param quality [quality: 0.0f, number: 0L] |
---|
392 | * @param line String of integer quality scores, separated by a whitespace |
---|
393 | * @return [quality: 0.0f, number: 0L] |
---|
394 | */ |
---|
395 | protected def updateQuality( def quality, String line ) { |
---|
396 | // Determine current average |
---|
397 | List tokens = line.tokenize(); |
---|
398 | Long total = 0; |
---|
399 | |
---|
400 | tokens.each { |
---|
401 | total += Integer.parseInt( it ); |
---|
402 | } |
---|
403 | |
---|
404 | int numTokens = tokens.size(); |
---|
405 | |
---|
406 | // Update the given average |
---|
407 | if( numTokens > 0 ) { |
---|
408 | quality.number += numTokens; |
---|
409 | quality.quality = quality.quality + ( ( total / numTokens as double ) - quality.quality ) / quality.number * numTokens; |
---|
410 | } |
---|
411 | |
---|
412 | return quality |
---|
413 | } |
---|
414 | |
---|
415 | /** |
---|
416 | * Moves a fasta and qual file to their permanent location, and returns information about these files |
---|
417 | * |
---|
418 | * @param fastaFile Filename of the fasta file in the temporary directory |
---|
419 | * @param qualFile Filename of the fasta file in the temporary directory |
---|
420 | * @param processedFiles Structure with data about uploaded files and matches |
---|
421 | * @return [ fasta: <fasta filename>, qual: <qual filename>, numSequences: <number of sequences>, avgQuality: <average quality> ] |
---|
422 | */ |
---|
423 | public def savePermanent( String fastaFile, String qualFile, def processedFiles ) { |
---|
424 | File permanentDirectory = fileService.absolutePath( ConfigurationHolder.config.massSequencing.fileDir ); |
---|
425 | def returnStructure = [:]; |
---|
426 | |
---|
427 | if( fileService.fileExists( fastaFile ) ) { |
---|
428 | // Lookup the original filename |
---|
429 | def fastaData = processedFiles.parsed.success.find { it.filename == fastaFile }; |
---|
430 | if( fastaData ) { |
---|
431 | returnStructure.fasta = fileService.moveFileToUploadDir( fileService.get( fastaFile ), fastaData.originalfilename, permanentDirectory ); |
---|
432 | returnStructure.numSequences = fastaData.numSequences; |
---|
433 | } else { |
---|
434 | throw new Exception( "Fasta file wasn't uploaded the right way. Maybe the session has expired/" ); |
---|
435 | } |
---|
436 | } else { |
---|
437 | // If the file doesn't exist, we can't save anything to the database. |
---|
438 | throw new Exception( "Fasta file to save doesn't exist on disk" ); |
---|
439 | } |
---|
440 | |
---|
441 | if( qualFile && fileService.fileExists( qualFile ) ) { |
---|
442 | // Lookup the original filename |
---|
443 | def qualData = processedFiles.parsed.success.find { it.filename == qualFile }; |
---|
444 | |
---|
445 | if( qualData ) { |
---|
446 | returnStructure.qual = fileService.moveFileToUploadDir( fileService.get(qualFile ), qualData.originalfilename, permanentDirectory ); |
---|
447 | returnStructure.avgQuality = qualData.avgQuality |
---|
448 | } else { |
---|
449 | // Delete the uploaded fasta file, since this is a serious error |
---|
450 | fileService.delete( returnStructure.fasta, permanentDirectory ); |
---|
451 | throw new Exception( "Qual file wasn't uploaded the right way. Maybe the session has expired" ); |
---|
452 | } |
---|
453 | } else { |
---|
454 | // If the file doesn't exist, we don't save any quality information |
---|
455 | returnStructure.qual = null |
---|
456 | returnStructure.avgQuality = 0; |
---|
457 | } |
---|
458 | |
---|
459 | return returnStructure; |
---|
460 | } |
---|
461 | |
---|
462 | /* |
---|
463 | * Creates a histogram of sequence lengths for the given assaysamples |
---|
464 | * @param assaySamples List of assaySamples to create the histogram for |
---|
465 | * @return List List with length-number tuples. Example: |
---|
466 | * [ [400, 12], [402, 9] ] |
---|
467 | */ |
---|
468 | public List sequenceLengthHistogram( ArrayList assaySamples ) { |
---|
469 | if( !assaySamples || assaySamples.size() == 0 ) |
---|
470 | return []; |
---|
471 | |
---|
472 | // Determine the directory the uploaded files are stored in |
---|
473 | File permanentDirectory = fileService.absolutePath( ConfigurationHolder.config.massSequencing.fileDir ); |
---|
474 | |
---|
475 | // Initialize the histogram |
---|
476 | def histogram = [:]; |
---|
477 | |
---|
478 | // Loop through all samples and find the histogram values for each of them |
---|
479 | assaySamples.each { assaySample -> |
---|
480 | if( assaySample.numSequences() > 0 ) { |
---|
481 | assaySample.sequenceData.each { sequenceData -> |
---|
482 | File sequenceFile = fileService.get( sequenceData.sequenceFile, permanentDirectory ); |
---|
483 | |
---|
484 | parseFastaLengths( sequenceFile ).each { k, v -> |
---|
485 | histogram[ k ] = ( histogram[ k ] ?: 0 ) + v; |
---|
486 | }; |
---|
487 | } |
---|
488 | } |
---|
489 | } |
---|
490 | |
---|
491 | def lengthList = []; |
---|
492 | histogram.each { |
---|
493 | if( it.key != 'unknown' ) |
---|
494 | lengthList << [ it.key, it.value ] |
---|
495 | } |
---|
496 | |
---|
497 | return lengthList; |
---|
498 | } |
---|
499 | |
---|
500 | /** |
---|
501 | * Exports the fasta data of a list of assaysamples |
---|
502 | * @param assaySamples Assaysamples to export |
---|
503 | * @param outStream Outputstream to send the data to |
---|
504 | * @return |
---|
505 | */ |
---|
506 | public def export( List assaySamples, OutputStream outStream, String name = null ) { |
---|
507 | if( !assaySamples || assaySamples.size() == 0 ) |
---|
508 | return false; |
---|
509 | |
---|
510 | // Retrieve the filename from configuration, if none is given |
---|
511 | if( !name ) |
---|
512 | name = ConfigurationHolder.config.massSequencing.exportFilename |
---|
513 | |
---|
514 | // Determine the directory the uploaded files are stored in |
---|
515 | File permanentDirectory = fileService.absolutePath( ConfigurationHolder.config.massSequencing.fileDir ); |
---|
516 | |
---|
517 | // First check whether qual files should be exported or not |
---|
518 | // It is only exported if qual scores are available for all sequences |
---|
519 | def exportQual = ( assaySamples*.numSequences().sum() == assaySamples*.numQualScores().sum() ); |
---|
520 | |
---|
521 | // First create tags for every sample |
---|
522 | def tags = []; |
---|
523 | |
---|
524 | // Determine new tag length. Since we can use 4 characters per |
---|
525 | // tag position, we only have to use 4log( #samples) |
---|
526 | // The minimum number of characters used is 10, to ensure the correct working of the other |
---|
527 | // programs. |
---|
528 | int tagLength = computeTagLength( assaySamples.size() ) |
---|
529 | int tagNumber = 0; |
---|
530 | |
---|
531 | assaySamples.each { assaySample -> |
---|
532 | if( assaySample.numSequences() > 0 ) { |
---|
533 | // Create a new tag for this assaysample |
---|
534 | def tag = createTag( tagLength, tagNumber++); |
---|
535 | |
---|
536 | // Save the tag for exporting |
---|
537 | tags << [ assaySampleId: assaySample.id, sampleName: assaySample.sample.name, |
---|
538 | assayName: assaySample.assay.name, studyName: assaySample.assay.study.name, |
---|
539 | forwardPrimer: assaySample.fwPrimerSeq, reversePrimer: assaySample.revPrimerSeq, |
---|
540 | tag: tag |
---|
541 | ]; |
---|
542 | } |
---|
543 | } |
---|
544 | |
---|
545 | // Now create zip file for fasta and qual files |
---|
546 | ZipOutputStream zipFile = new ZipOutputStream( new BufferedOutputStream( outStream ) ); |
---|
547 | BufferedWriter zipWriter = new BufferedWriter( new OutputStreamWriter( zipFile ) ); |
---|
548 | |
---|
549 | // We have to loop twice through the sequenceData, since we can't write part of the sequence |
---|
550 | // file and part of the qual files mixed. We have to write the full sequence file first. |
---|
551 | try { |
---|
552 | zipFile.putNextEntry( new ZipEntry( name + ".fna" ) ); |
---|
553 | |
---|
554 | assaySamples.each { assaySample -> |
---|
555 | if( assaySample.numSequences() > 0 ) { |
---|
556 | def currentTag = tags.find { it.assaySampleId == assaySample.id }; |
---|
557 | |
---|
558 | assaySample.sequenceData.each { sequenceData -> |
---|
559 | copyFastaFileForExport( fileService.get( sequenceData.sequenceFile, permanentDirectory ), currentTag.tag, zipWriter) |
---|
560 | } |
---|
561 | } |
---|
562 | } |
---|
563 | zipWriter.flush(); |
---|
564 | zipFile.closeEntry(); |
---|
565 | |
---|
566 | if( exportQual ) { |
---|
567 | zipFile.putNextEntry( new ZipEntry( name + ".qual" ) ); |
---|
568 | |
---|
569 | assaySamples.each { assaySample -> |
---|
570 | if( assaySample.numSequences() > 0 ) { |
---|
571 | def currentTag = tags.find { it.assaySampleId == assaySample.id }; |
---|
572 | |
---|
573 | assaySample.sequenceData.each { sequenceData -> |
---|
574 | copyQualFileForExport( fileService.get( sequenceData.qualityFile, permanentDirectory ), currentTag.tag, zipWriter) |
---|
575 | } |
---|
576 | } |
---|
577 | } |
---|
578 | |
---|
579 | zipWriter.flush(); |
---|
580 | zipFile.closeEntry(); |
---|
581 | } |
---|
582 | |
---|
583 | } catch( Exception e ) { |
---|
584 | log.error "Error while writing to fastafile or qualfile: " + e.getMessage(); |
---|
585 | } finally { |
---|
586 | // Always close zip entry |
---|
587 | try { |
---|
588 | zipFile.closeEntry(); |
---|
589 | } catch( Exception e ) { |
---|
590 | log.error "Error while closing zip entry for fasta and qual: " + e.getMessage(); |
---|
591 | } |
---|
592 | } |
---|
593 | |
---|
594 | // Export a tab delimited file with tags |
---|
595 | zipFile.putNextEntry( new ZipEntry( name + ".tab" ) ); |
---|
596 | exportTabDelimitedSampleTagFile( tags, zipWriter ); |
---|
597 | zipWriter.flush(); |
---|
598 | zipFile.closeEntry(); |
---|
599 | |
---|
600 | // Export a mothur file with tags |
---|
601 | zipFile.putNextEntry( new ZipEntry( name + ".oligos" ) ); |
---|
602 | exportMothurSampleTagFile( tags, zipWriter ); |
---|
603 | zipWriter.flush(); |
---|
604 | zipFile.closeEntry(); |
---|
605 | |
---|
606 | // Export an excel file with information about the samples |
---|
607 | zipFile.putNextEntry( new ZipEntry( name + ".xls" ) ); |
---|
608 | sampleExcelService.exportExcelSampleData( assaySamples, tags, zipFile ); |
---|
609 | zipFile.flush(); |
---|
610 | zipFile.closeEntry(); |
---|
611 | |
---|
612 | zipFile.close(); |
---|
613 | } |
---|
614 | |
---|
615 | /** |
---|
616 | * Creates an oligos file for Mothur that represents the connection between samples |
---|
617 | * and the artificial tags. |
---|
618 | * |
---|
619 | * @see http://www.mothur.org/wiki/Trim.seqs#allfiles |
---|
620 | * @param tags Map with newly created tags |
---|
621 | * @param zipWriter Writer to write the data to |
---|
622 | */ |
---|
623 | protected void exportMothurSampleTagFile( List tags, Writer zipWriter ) { |
---|
624 | // Add the forward and reverse primers, as found in the assaysamples |
---|
625 | // The primers are already cut off, so they are not relevant anymore |
---|
626 | // For that reason, a '#' is prepended to each line. |
---|
627 | def fwPrimers = tags.collect { it.forwardPrimer }.findAll { it }.unique(); |
---|
628 | def revPrimers = tags.collect { it.reversePrimer }.findAll { it }.unique(); |
---|
629 | |
---|
630 | fwPrimers.each { zipWriter.write( "#forward\t" + it + "\n" ) } |
---|
631 | revPrimers.each { zipWriter.write( "#reverse\t" + it + "\n" ) } |
---|
632 | |
---|
633 | // Check whether the sample names are unique. If they aren't, the assay and study names |
---|
634 | // are appended to the sample name |
---|
635 | def sampleNames = tags*.sampleNames; |
---|
636 | if( sampleNames.unique().size() < sampleNames.size() ) { |
---|
637 | tags.each { |
---|
638 | it.uniqueName = it.sampleName + " (" + it.assayName + " / " + it.studyName + ")"; |
---|
639 | } |
---|
640 | } else { |
---|
641 | tags.each { |
---|
642 | it.uniqueName = it.sampleName; |
---|
643 | } |
---|
644 | } |
---|
645 | |
---|
646 | tags.each { |
---|
647 | // Samples names are not allowed to contain spaces |
---|
648 | def name = it.uniqueName.replaceAll( /\s+/, '-' ); |
---|
649 | zipWriter.write( "barcode\t" + it.tag + "\t" + name + "\n" ); |
---|
650 | } |
---|
651 | } |
---|
652 | |
---|
653 | /** |
---|
654 | * Creates a tab delimited file with two columns and column headers "Sequence" and "Samplename" |
---|
655 | * @param tags Map with newly created tags |
---|
656 | * @param zipWriter Writer to write the data to |
---|
657 | */ |
---|
658 | protected void exportTabDelimitedSampleTagFile( List tags, Writer zipWriter ) { |
---|
659 | zipWriter.write( "Sequence" + "\t" + "Samplename" + "\n" ); |
---|
660 | |
---|
661 | // Check whether the sample names are unique. If they aren't, the assay and study names |
---|
662 | // are appended to the sample name |
---|
663 | def sampleNames = tags*.sampleNames; |
---|
664 | if( sampleNames.unique().size() < sampleNames.size() ) { |
---|
665 | tags.each { |
---|
666 | it.uniqueName = it.sampleName + " (" + it.assayName + " / " + it.studyName + ")"; |
---|
667 | } |
---|
668 | } else { |
---|
669 | tags.each { |
---|
670 | it.uniqueName = it.sampleName; |
---|
671 | } |
---|
672 | } |
---|
673 | |
---|
674 | tags.each { |
---|
675 | def name = it.uniqueName.replaceAll( /\s+/, '-' ); |
---|
676 | zipWriter.write( it.tag + "\t" + name + "\n" ); |
---|
677 | } |
---|
678 | } |
---|
679 | |
---|
680 | /** |
---|
681 | * Deletes all sequences from a list of assaysamples |
---|
682 | * @param assaySample |
---|
683 | * @return |
---|
684 | */ |
---|
685 | public int deleteSequenceData( ArrayList assaySamples ) { |
---|
686 | def numFiles = 0; |
---|
687 | assaySamples?.each { assaySample -> |
---|
688 | numFiles += assaySample.deleteSequenceData(); |
---|
689 | } |
---|
690 | |
---|
691 | return numFiles; |
---|
692 | } |
---|
693 | |
---|
694 | /** |
---|
695 | * Creates a unique tag for the given number |
---|
696 | * @param length |
---|
697 | * @param tagNumber |
---|
698 | * @return |
---|
699 | */ |
---|
700 | public String createTag( int length, int tagNumber ) { |
---|
701 | def chars = ["C", "A", "G", "T"]; |
---|
702 | def numChars = chars.size(); |
---|
703 | |
---|
704 | // Determine whether the tagNumber is not too high |
---|
705 | if( tagNumber > numChars * ( ( numChars - 1 ) ** ( length - 1 ) ) ) |
---|
706 | throw new Exception( "Given tag number (" + tagNumber + ") is too large for the specified length (" + length + ")") |
---|
707 | |
---|
708 | String tag = ""; |
---|
709 | |
---|
710 | // Loop through all characters |
---|
711 | for( def i = 0; i < length; i++ ) { |
---|
712 | def currentChars |
---|
713 | |
---|
714 | // Make sure the characters to choose from are different from |
---|
715 | // the previous character |
---|
716 | if( i == 0 ) |
---|
717 | currentChars = chars; |
---|
718 | else |
---|
719 | currentChars = chars - tag[ 0 ] |
---|
720 | |
---|
721 | // Determine which character to append by taking the tagNumber modulo the number of characters |
---|
722 | int currentChar = tagNumber % currentChars.size() |
---|
723 | tag = currentChars[ currentChar ] + tag; |
---|
724 | |
---|
725 | tagNumber = Math.floor( tagNumber / currentChars.size() ); |
---|
726 | } |
---|
727 | |
---|
728 | return tag; |
---|
729 | } |
---|
730 | |
---|
731 | /** |
---|
732 | * Computes the number of characters needed for a tag to be unique for the given number of samples |
---|
733 | * @param numSamples Number of samples to generate tags for |
---|
734 | * @return Number of characters needed for a unique tag |
---|
735 | */ |
---|
736 | public int computeTagLength( int numSamples ) { |
---|
737 | // No homopolymers are allowed in the tag. For that reason we have |
---|
738 | // 4 possibilities for the first character, and 3 for every next character |
---|
739 | |
---|
740 | // In order to allow enough characters, we take the 3log(numSamples), with a |
---|
741 | // minimum of 10 |
---|
742 | return Math.max( 10, Math.ceil( Math.log( numSamples ) / Math.log( 3 ) ) ); |
---|
743 | } |
---|
744 | |
---|
745 | /** |
---|
746 | * Copies the contents of the given sequence file to the output file and prepends the tag to every sequences |
---|
747 | * @param inFile Filename of the file to be read |
---|
748 | * @param tag |
---|
749 | * @param outWriter |
---|
750 | * @return |
---|
751 | */ |
---|
752 | protected boolean copyFastaFileForExport( File inFile, String tag, BufferedWriter outWriter ) { |
---|
753 | // Walk through the lines in the file, starting with '>' |
---|
754 | // (and where the following line contains a character other than '>') |
---|
755 | |
---|
756 | try { |
---|
757 | BufferedReader inReader = new BufferedReader( new FileReader( inFile ) ); |
---|
758 | |
---|
759 | String line = null |
---|
760 | String newLine = null |
---|
761 | String sequence = ""; |
---|
762 | |
---|
763 | def lengthPattern = ~/length=(\d+)/ |
---|
764 | def lengthMatches |
---|
765 | int length = 0; |
---|
766 | int tagLength = tag.size(); |
---|
767 | |
---|
768 | while( ( line = inReader.readLine()) != null) { |
---|
769 | if( line.size() == 0 ) { |
---|
770 | // Print the sequence we collected, before writing the empty line |
---|
771 | printSequence( outWriter, sequence, tag ); |
---|
772 | sequence = ""; |
---|
773 | |
---|
774 | // Empty line |
---|
775 | outWriter.newLine(); |
---|
776 | } else if( line[ 0 ] == '>' ) { |
---|
777 | // Print the sequence we collected, before writing the new comments tag |
---|
778 | printSequence( outWriter, sequence, tag ); |
---|
779 | sequence = ""; |
---|
780 | |
---|
781 | // Comments line: replace length=### with the |
---|
782 | // updated length, and put the line in the |
---|
783 | lengthMatches = ( line =~ lengthPattern ); |
---|
784 | if( lengthMatches ) { |
---|
785 | length = Integer.valueOf( lengthMatches[0][1] ) + tagLength; |
---|
786 | newLine = lengthMatches.replaceAll( "length=" + length ); |
---|
787 | } |
---|
788 | |
---|
789 | outWriter.write(newLine); |
---|
790 | outWriter.newLine(); |
---|
791 | } else { |
---|
792 | // This is part of the sequence. We collect the whole sequence and |
---|
793 | // determine in the end how to write it to the file |
---|
794 | sequence += line; |
---|
795 | } |
---|
796 | } |
---|
797 | |
---|
798 | // Print the sequence we collected, before ending the file |
---|
799 | printSequence( outWriter, sequence, tag ); |
---|
800 | sequence = ""; |
---|
801 | |
---|
802 | } catch( Exception e ) { |
---|
803 | log.error( "An error occurred while copying contents from " + inFile.getName() + ": " + e.getMessage() ); |
---|
804 | return false; |
---|
805 | } |
---|
806 | } |
---|
807 | |
---|
808 | /** |
---|
809 | * Prints a sequence to the output file |
---|
810 | * @param outWriter |
---|
811 | * @param sequence |
---|
812 | * @param tag |
---|
813 | */ |
---|
814 | private void printSequence( BufferedWriter outWriter, String sequence, String tag, int maxWidth = 60 ) { |
---|
815 | // If no sequence is given, also don't prepend it with the tag |
---|
816 | if( sequence.size() == 0 ) |
---|
817 | return |
---|
818 | |
---|
819 | // Prepend the tag to the sequence |
---|
820 | sequence = tag + sequence; |
---|
821 | |
---|
822 | // Write the sequence with a width of maxWidth characters per line |
---|
823 | while( sequence ) { |
---|
824 | if( sequence.size() > maxWidth ) { |
---|
825 | outWriter.write( sequence[0..maxWidth-1] ); |
---|
826 | sequence = sequence[maxWidth..-1] |
---|
827 | } else { |
---|
828 | outWriter.write( sequence ); |
---|
829 | sequence = null; |
---|
830 | } |
---|
831 | outWriter.newLine(); |
---|
832 | } |
---|
833 | } |
---|
834 | |
---|
835 | /** |
---|
836 | * Copies the contents of the given qual file to the output file and prepends the tag quality score to every sequence. |
---|
837 | * For every tag character '40' is prepended to the qual scores |
---|
838 | * |
---|
839 | * @param inFile Filename of the file to be read |
---|
840 | * @param tag |
---|
841 | * @param outWriter |
---|
842 | * @return |
---|
843 | */ |
---|
844 | protected boolean copyQualFileForExport( File inFile, String tag, BufferedWriter outWriter ) { |
---|
845 | // Walk through the lines in the file, starting with '>' |
---|
846 | // (and where the following line contains a character other than '>') |
---|
847 | try { |
---|
848 | BufferedReader inReader = new BufferedReader( new FileReader( inFile ) ); |
---|
849 | |
---|
850 | String line = null |
---|
851 | String newLine = null |
---|
852 | List<Integer> qualScores = [] |
---|
853 | |
---|
854 | def lengthPattern = ~/length=(\d+)/ |
---|
855 | def lengthMatches |
---|
856 | int length = 0; |
---|
857 | int tagLength = tag.size(); |
---|
858 | |
---|
859 | while( ( line = inReader.readLine()) != null) { |
---|
860 | if( line.size() == 0 ) { |
---|
861 | // Print the quality scores we collected, before writing the empty line |
---|
862 | printQualScore( outWriter, qualScores, tagLength ); |
---|
863 | qualScores = []; |
---|
864 | |
---|
865 | // Empty line |
---|
866 | outWriter.newLine(); |
---|
867 | } else if( line[ 0 ] == '>' ) { |
---|
868 | // Print the quality scores we collected, before writing the empty line |
---|
869 | printQualScore( outWriter, qualScores, tagLength ); |
---|
870 | qualScores = []; |
---|
871 | |
---|
872 | // Comments line: replace length=### with the |
---|
873 | // updated length, and put the line in the |
---|
874 | lengthMatches = ( line =~ lengthPattern ); |
---|
875 | if( lengthMatches ) { |
---|
876 | length = Integer.valueOf( lengthMatches[0][1] ) + tagLength; |
---|
877 | newLine = lengthMatches.replaceAll( "length=" + length ); |
---|
878 | } |
---|
879 | |
---|
880 | outWriter.write(newLine); |
---|
881 | outWriter.newLine(); |
---|
882 | } else { |
---|
883 | // This is part of the quality score. We collect the whole set of quality |
---|
884 | // scores and determine in the end how to write it to the file |
---|
885 | qualScores += line.split( " " ).collect { |
---|
886 | if( !it.isInteger() ) |
---|
887 | return 0; |
---|
888 | else |
---|
889 | return Integer.parseInt( it ); |
---|
890 | }; |
---|
891 | } |
---|
892 | } |
---|
893 | |
---|
894 | // Print the quality scores we collected, before ending the file |
---|
895 | printQualScore( outWriter, qualScores, tagLength ); |
---|
896 | qualScores = []; |
---|
897 | |
---|
898 | } catch( Exception e ) { |
---|
899 | log.error( "An error occurred while copying contents from " + inFile.getName() + ": " + e.getMessage() ); |
---|
900 | return false; |
---|
901 | } |
---|
902 | } |
---|
903 | |
---|
904 | /** |
---|
905 | * Prints a sequence to the output file |
---|
906 | * @param outWriter |
---|
907 | * @param sequence |
---|
908 | * @param tag |
---|
909 | */ |
---|
910 | private void printQualScore( BufferedWriter outWriter, List<Integer> qualScores, int tagLength, int maxWidth = 60 ) { |
---|
911 | // If no qualScores are given, also don't prepend it with the tag |
---|
912 | if( qualScores.size() == 0 ) |
---|
913 | return |
---|
914 | |
---|
915 | // Prepend the tag to the sequence |
---|
916 | qualScores = Collections.nCopies( tagLength, 40 ) + qualScores; |
---|
917 | |
---|
918 | // Write the sequence with a width of maxWidth characters per line |
---|
919 | while( qualScores ) { |
---|
920 | if( qualScores.size() > maxWidth ) { |
---|
921 | outWriter.write( qualScores[0..maxWidth-1].join( " " ) ); |
---|
922 | qualScores = qualScores[maxWidth..-1] |
---|
923 | } else { |
---|
924 | outWriter.write( qualScores.join( " " ) ); |
---|
925 | qualScores = null; |
---|
926 | } |
---|
927 | outWriter.newLine(); |
---|
928 | } |
---|
929 | } |
---|
930 | |
---|
931 | } |
---|