1 | package nl.tno.massSequencing |
---|
2 | |
---|
3 | import java.io.BufferedWriter; |
---|
4 | import java.io.File; |
---|
5 | import java.io.Writer; |
---|
6 | import java.util.ArrayList; |
---|
7 | import java.util.List; |
---|
8 | import org.codehaus.groovy.grails.commons.ConfigurationHolder |
---|
9 | import java.util.zip.* |
---|
10 | |
---|
11 | class FastaService implements nl.tno.massSequencing.imports.Importer { |
---|
12 | def fileService |
---|
13 | def fuzzySearchService |
---|
14 | def sampleExcelService |
---|
15 | def classificationService |
---|
16 | def excelService |
---|
17 | def workerService |
---|
18 | |
---|
19 | static transactional = false |
---|
20 | |
---|
21 | // After exporting sequence and qual files, the GSCF data has to be retrieved. However, |
---|
22 | // we don't know how long that takes, but is it proportional to the number of samples |
---|
23 | // For that reason, we append a number to the total size, so the progress bar will keep |
---|
24 | // some space open while fetching GSCF data |
---|
25 | def AMOUNT_PER_ASSAYSAMPLE = 100000; |
---|
26 | |
---|
27 | /** |
---|
28 | * Determines whether a file can be processed. |
---|
29 | * @param filetype Filetype of the file |
---|
30 | * @see determineFileType() |
---|
31 | * @return |
---|
32 | */ |
---|
33 | public boolean canParseFileType( String filetype ) { |
---|
34 | switch( filetype ) { |
---|
35 | case "fasta": |
---|
36 | case "qual": |
---|
37 | return true; |
---|
38 | default: |
---|
39 | return false; |
---|
40 | } |
---|
41 | } |
---|
42 | |
---|
43 | /** |
---|
44 | * Parses the given FASTA or QUAL file |
---|
45 | * |
---|
46 | * @param file File to parse |
---|
47 | * @param filetype Type of the given file |
---|
48 | * @param onProgress Closure to execute when progress indicators should be updated. |
---|
49 | * Has 2 parameters: progress that indicates the number of bytes that have been processed. The second parameter determines |
---|
50 | * the number of bytes that has to be processed extra after this action (e.g. when a zip file is extracted, the extracted |
---|
51 | * files should be added to the total number of bytes to be processed) |
---|
52 | * @return Structure with information about the parsed files. The 'success' files are |
---|
53 | * moved to the given directory |
---|
54 | * |
---|
55 | * Examples: |
---|
56 | * [filename: 'abc.fasta', type: FASTA, numSequences: 190] |
---|
57 | * [filename: 'cde.qual', type: QUAL, numSequences: 140, avgQuality: 29] |
---|
58 | * |
---|
59 | * [filename: 'test.zip', type: ZIP, extraFiles: [newfile1.xls, newfile2.xls, newfile3.xls]] |
---|
60 | * |
---|
61 | * [filename: 'testing.xls', type: 'unknown', message: 'Type not recognized'] |
---|
62 | * |
---|
63 | */ |
---|
64 | |
---|
65 | public Map parseFile( File file, String filetype, Closure onProgress ) { |
---|
66 | switch( filetype ) { |
---|
67 | case "fasta": |
---|
68 | return parseFasta( file, onProgress ); |
---|
69 | case "qual": |
---|
70 | return parseQual( file, onProgress ); |
---|
71 | default: |
---|
72 | return [ success: false, type: filetype, message: 'Filetype could not be parsed.' ] |
---|
73 | } |
---|
74 | } |
---|
75 | |
---|
76 | /** |
---|
77 | * Parses the given FASTA file |
---|
78 | * @param file File to parse |
---|
79 | * @param onProgress Closure to execute when progress indicators should be updated. |
---|
80 | * Has 2 parameters: numFilesProcessed and numBytesProcessed that indicate the number |
---|
81 | * of files and bytes that have been processed in this file (so the first parameter should |
---|
82 | * only be 1 when the file is finished) |
---|
83 | * @return List structure. Examples: |
---|
84 | * |
---|
85 | * [ success: true, filename: 'abc.fasta', type: 'fasta', numSequences: 200 ] |
---|
86 | * [ success: false, filename: 'def.fasta', type: 'fasta', message: 'File is not a valid FASTA file' ] |
---|
87 | */ |
---|
88 | protected def parseFasta( File file, Closure onProgress ) { |
---|
89 | |
---|
90 | long startTime = System.nanoTime(); |
---|
91 | log.trace "Start parsing FASTA " + file.getName() |
---|
92 | |
---|
93 | // Count the number of lines, starting with '>' (and where the following line contains a character other than '>') |
---|
94 | long numSequences = 0; |
---|
95 | long bytesProcessed = 0; |
---|
96 | boolean lookingForSequence = false; |
---|
97 | |
---|
98 | file.eachLine { line -> |
---|
99 | if( line ) { |
---|
100 | if( !lookingForSequence && line[0] == '>' ) { |
---|
101 | lookingForSequence = true; |
---|
102 | } else if( lookingForSequence ) { |
---|
103 | if( line[0] != '>' ) { |
---|
104 | numSequences++; |
---|
105 | lookingForSequence = false; |
---|
106 | } |
---|
107 | } |
---|
108 | |
---|
109 | |
---|
110 | // Update progress every time 1MB is processed |
---|
111 | bytesProcessed += line.size(); |
---|
112 | if( bytesProcessed > 1000000 ) { |
---|
113 | onProgress( bytesProcessed, 0 ); |
---|
114 | bytesProcessed = 0; |
---|
115 | } |
---|
116 | } |
---|
117 | |
---|
118 | } |
---|
119 | |
---|
120 | // Update progress and say we're finished |
---|
121 | onProgress( bytesProcessed, 0 ); |
---|
122 | |
---|
123 | log.trace "Finished parsing FASTA " + file.getName() + ": " + ( System.nanoTime() - startTime ) / 1000000L |
---|
124 | |
---|
125 | return [ success: true, type: "fasta", filename: file.getName(), numSequences: numSequences ]; |
---|
126 | } |
---|
127 | |
---|
128 | /** |
---|
129 | * Parses the given FASTA file and returns a list with sequence lengths |
---|
130 | * @param file File to parse |
---|
131 | * @return List [ 400: 12, 410: 15, 'unknown': 14 ] |
---|
132 | */ |
---|
133 | protected def parseFastaLengths( File file ) { |
---|
134 | |
---|
135 | long startTime = System.nanoTime(); |
---|
136 | log.trace "Start parsing FASTA " + file.getName() |
---|
137 | |
---|
138 | // Count the number of lines, starting with '>' (and where the following line contains a character other than '>') |
---|
139 | def histogram = [:] |
---|
140 | def lengthPattern = ~/length=(\d+)/ |
---|
141 | def length |
---|
142 | def lengthMatches |
---|
143 | def currentLength = 0; |
---|
144 | def countingLines = false; |
---|
145 | try { |
---|
146 | file.eachLine { line -> |
---|
147 | if( line ) { |
---|
148 | if( line[0] == '>' ) { |
---|
149 | if( countingLines ) { |
---|
150 | // If counting the length in lines, we must stop now and store the length |
---|
151 | histogram[ currentLength ] = ( histogram[ currentLength ] ?: 0 ) + 1; |
---|
152 | countingLines = false; |
---|
153 | } |
---|
154 | |
---|
155 | // Comments line: find length=### |
---|
156 | lengthMatches = ( line =~ lengthPattern ); |
---|
157 | if( lengthMatches ) { |
---|
158 | length = Integer.valueOf( lengthMatches[0][1] ); |
---|
159 | |
---|
160 | histogram[ length ] = ( histogram[length] ?: 0 ) + 1; |
---|
161 | } else { |
---|
162 | // Length is not mentioned in the comments line |
---|
163 | countingLines = true; |
---|
164 | currentLength = 0; |
---|
165 | } |
---|
166 | } else if( countingLines ) { |
---|
167 | // Compute the length by counting the files themselves. |
---|
168 | currentLength += line.trim().size(); |
---|
169 | } |
---|
170 | } |
---|
171 | } |
---|
172 | } catch( Exception e ) { |
---|
173 | log.error( "Exception while reading fasta file " + file + ": " + e.getMessage() ) |
---|
174 | } |
---|
175 | |
---|
176 | log.trace "Finished parsing FASTA " + file.getName() + ": " + ( System.nanoTime() - startTime ) / 1000000L |
---|
177 | log.trace "Histogram: " + histogram |
---|
178 | |
---|
179 | return histogram; |
---|
180 | } |
---|
181 | |
---|
182 | /** |
---|
183 | * Parses the given QUAL file |
---|
184 | * @param file File to parse |
---|
185 | * @param onProgress Closure to execute when progress indicators should be updated. |
---|
186 | * Has 2 parameters: numFilesProcessed and numBytesProcessed that indicate the number |
---|
187 | * of files and bytes that have been processed in this file (so the first parameter should |
---|
188 | * only be 1 when the file is finished) |
---|
189 | * @return List structure. Examples: |
---|
190 | * |
---|
191 | * [ success: true, filename: 'abc.qual', type: 'qual', numSequences: 200, avgQuality: 31 ] |
---|
192 | * [ success: false, filename: 'def.qual', type: 'qual', message: 'File is not a valid QUAL file' ] |
---|
193 | */ |
---|
194 | protected def parseQual( File file, Closure onProgress ) { |
---|
195 | long startTime = System.nanoTime(); |
---|
196 | log.trace "Start parsing QUAL " + file.getName() |
---|
197 | |
---|
198 | // Count the number of lines, starting with '>'. After we've found such a character, we continue looking for |
---|
199 | // quality scores |
---|
200 | long numSequences = 0; |
---|
201 | long bytesProcessed = 0; |
---|
202 | def quality = [ quality: 0.0, number: 0L ] |
---|
203 | |
---|
204 | boolean lookingForFirstQualityScores = false; |
---|
205 | file.eachLine { line -> |
---|
206 | if( line ) { |
---|
207 | if( !lookingForFirstQualityScores && line[0] == '>' ) { |
---|
208 | lookingForFirstQualityScores = true; |
---|
209 | } else if( lookingForFirstQualityScores ) { |
---|
210 | if( line[0] != '>' ) { |
---|
211 | numSequences++; |
---|
212 | lookingForFirstQualityScores = false; |
---|
213 | |
---|
214 | // Don't compute average quality because it takes too much time |
---|
215 | //quality = updateQuality( quality, line ); |
---|
216 | } |
---|
217 | } else { |
---|
218 | // Don't compute average quality because it takes too much time |
---|
219 | //quality = updateQuality( quality, line ); |
---|
220 | } |
---|
221 | |
---|
222 | // Update progress every time 1MB is processed |
---|
223 | bytesProcessed += line.size(); |
---|
224 | if( bytesProcessed > 1000000 ) { |
---|
225 | onProgress( bytesProcessed, 0 ); |
---|
226 | bytesProcessed = 0; |
---|
227 | } |
---|
228 | } |
---|
229 | |
---|
230 | } |
---|
231 | |
---|
232 | // Update progress and say we're finished |
---|
233 | onProgress( bytesProcessed, 0 ); |
---|
234 | |
---|
235 | log.trace "Finished parsing QUAL " + file.getName() + ": " + ( System.nanoTime() - startTime ) / 1000000L |
---|
236 | |
---|
237 | return [ success: true, type: "qual", filename: file.getName(), numSequences: numSequences, avgQuality: quality.quality ]; |
---|
238 | } |
---|
239 | |
---|
240 | |
---|
241 | /** |
---|
242 | * Matches uploaded fasta and qual files and combines them with the samples they probably belong to |
---|
243 | * @param parsedFiles Parsed files |
---|
244 | * [ |
---|
245 | * [filename: 'abc.fasta', type: FASTA, numSequences: 190] |
---|
246 | * [filename: 'cde.fasta', type: FASTA, numSequences: 140] |
---|
247 | * [filename: 'abc.qual', type: QUAL, numSequences: 190, avgQuality: 38] |
---|
248 | * [filename: 'cde.qual', type: QUAL, numSequences: 140, avgQuality: 29] |
---|
249 | * [filename: 'match.xls', type: EXCEL, matches: [ [ filename: 'abc.fasta', basename: 'abc', sample: 's1' ] ] |
---|
250 | * ] |
---|
251 | * @param samples AssaySample objects to which the files should be matched. |
---|
252 | * @return Structure with information about the matching. |
---|
253 | * [ |
---|
254 | * [ |
---|
255 | * fasta: [filename: 'abc.fasta', type: FASTA, numSequences: 190], |
---|
256 | * qual: [filename: 'abc.qual', type: QUAL, numSequences: 190, avgQuality: 38], |
---|
257 | * feasibleQuals: [ |
---|
258 | * [filename: 'abc.qual', type: QUAL, numSequences: 190, avgQuality: 38], |
---|
259 | * [filename: 'def.qual', type: QUAL, numSequences: 190, avgQuality: 21] |
---|
260 | * ] |
---|
261 | * sample: AssaySample object |
---|
262 | */ |
---|
263 | def matchFiles( def parsedFiles, def samples ) { |
---|
264 | def fastas = parsedFiles.findAll { it.type == "fasta" } |
---|
265 | def quals = parsedFiles.findAll { it.type == "qual" } |
---|
266 | def classifications = parsedFiles.findAll { it.type == "taxonomy" } |
---|
267 | def logfiles = parsedFiles.findAll { it.type == "logfile" } |
---|
268 | def excels = parsedFiles.findAll { it.type == "excel" } |
---|
269 | |
---|
270 | samples = samples.toList() |
---|
271 | |
---|
272 | // Collect (filename) matches from all excel files |
---|
273 | def matches = []; |
---|
274 | excels.each { m -> |
---|
275 | if( m.matches && m.matches.filenames ) { |
---|
276 | // [ filename: filename, samplename: GSCFSample, basename: basename ] |
---|
277 | m.matches.filenames.each { matches << it } |
---|
278 | } |
---|
279 | } |
---|
280 | |
---|
281 | def files = []; |
---|
282 | |
---|
283 | fastas.each { fastaFile -> |
---|
284 | // Remove extension |
---|
285 | def matchWith = fastaFile.originalfilename.substring( 0, fastaFile.originalfilename.lastIndexOf( '.' ) ) |
---|
286 | |
---|
287 | // Determine feasible quals (based on number of sequences ) |
---|
288 | def feasibleQuals = quals.findAll { it.numSequences == fastaFile.numSequences } |
---|
289 | def feasibleClassifications = classifications.findAll { it.numLines == fastaFile.numSequences } |
---|
290 | def feasibleLogfiles = logfiles; |
---|
291 | |
---|
292 | // Best matching qual file |
---|
293 | def qualIdx = fuzzySearchService.mostSimilarWithIndex( matchWith + '.qual', feasibleQuals.originalfilename ); |
---|
294 | |
---|
295 | def qual = null |
---|
296 | if( qualIdx != null ) |
---|
297 | qual = feasibleQuals[ qualIdx ]; |
---|
298 | |
---|
299 | // Best matching classification file |
---|
300 | def classificationIdx = fuzzySearchService.mostSimilarWithIndex( matchWith + '.taxonomy', feasibleClassifications.originalfilename ); |
---|
301 | |
---|
302 | def classification = null |
---|
303 | if( classificationIdx != null ) |
---|
304 | classification = feasibleClassifications[ classificationIdx ]; |
---|
305 | |
---|
306 | // Best matching classification file |
---|
307 | def logfileIdx = fuzzySearchService.mostSimilarWithIndex( matchWith + '.logfile', feasibleLogfiles.originalfilename ); |
---|
308 | |
---|
309 | def logfile = null |
---|
310 | if( logfileIdx != null ) |
---|
311 | logfile = feasibleLogfiles[ logfileIdx ]; |
---|
312 | |
---|
313 | // Best matching sample |
---|
314 | def assaySample = null |
---|
315 | def checked = true; |
---|
316 | if( matches ) { |
---|
317 | // Match with files from excelsheet |
---|
318 | |
---|
319 | // First find the best matching filename in the list of matches. |
---|
320 | def sampleNameIdx = fuzzySearchService.mostSimilarWithIndex( matchWith, matches*.basename, 0.8 ); |
---|
321 | |
---|
322 | // If one is found, use the sample name associated with it to do the matching with samples |
---|
323 | if( sampleNameIdx != null ) { |
---|
324 | matchWith = matches[ sampleNameIdx ].samplename; |
---|
325 | } else { |
---|
326 | // If no match is found in the excel sheet, this sample is unchecked by default |
---|
327 | checked = false; |
---|
328 | } |
---|
329 | } |
---|
330 | |
---|
331 | // Match on filenames |
---|
332 | def sampleIdx = fuzzySearchService.mostSimilarWithIndex( matchWith, samples.sample.name ); |
---|
333 | if( sampleIdx != null ) { |
---|
334 | assaySample = samples[ sampleIdx ]; |
---|
335 | } |
---|
336 | |
---|
337 | files << [ |
---|
338 | fasta: fastaFile, |
---|
339 | feasibleQuals: feasibleQuals, |
---|
340 | feasibleClassifications: feasibleClassifications, |
---|
341 | feasibleLogfiles: feasibleLogfiles, |
---|
342 | qual: qual, |
---|
343 | classification: classification, |
---|
344 | logfile: logfile, |
---|
345 | assaySample: assaySample, |
---|
346 | checked: checked |
---|
347 | ] |
---|
348 | } |
---|
349 | |
---|
350 | return files; |
---|
351 | |
---|
352 | |
---|
353 | } |
---|
354 | |
---|
355 | |
---|
356 | /** |
---|
357 | * Parses a given excel file with a match between filenames and samples |
---|
358 | * @param file File to parse |
---|
359 | * @param onProgress Closure to execute when progress indicators should be updated. |
---|
360 | * Has 2 parameters: numFilesProcessed and numBytesProcessed that indicate the number |
---|
361 | * of files and bytes that have been processed in this file (so the first parameter should |
---|
362 | * only be 1 when the file is finished) |
---|
363 | * @return List structure. The matches array contains an array of matches between filenames and sample(name)s. |
---|
364 | * The extension for all files are removed in the 'basename' parameter, in order to improve matching. |
---|
365 | * Examples: |
---|
366 | * |
---|
367 | * [ success: true, filename: 'abc.xls', type: 'excel', matches: [ [ filename: 's1.qual', basename: 's1', sample: 'sample a' ], [ filename: 's9.fna', basename: 's9', sample: 'sample b' ] ] |
---|
368 | * [ success: false, filename: 'def.xls', type: 'excel', message: 'File is not a valid XLS file' ] |
---|
369 | */ |
---|
370 | public def inferExcelMatches( parsedFiles ) { |
---|
371 | // Loop through all excel files, and determine which (usable) columns are in the file |
---|
372 | for( parsedFile in parsedFiles ) { |
---|
373 | if( parsedFile.type == "excel" ) { |
---|
374 | |
---|
375 | def header = parsedFile.header; |
---|
376 | def data = parsedFile.data |
---|
377 | |
---|
378 | // Check whether (at least) 2 columns are present |
---|
379 | if( header.size() < 2 ) { |
---|
380 | parsedFile.success = false |
---|
381 | parsedFile.exceldata = [] |
---|
382 | parsedFile.message = "Excel file must contain at least 2 columns, one with filenames and one with sample names" |
---|
383 | continue; |
---|
384 | } |
---|
385 | |
---|
386 | // Check the headers to see whether any columns are found |
---|
387 | def columns = [:] |
---|
388 | def matches = [ "filenames": [], "mothursamples": [] ] |
---|
389 | |
---|
390 | header.eachWithIndex { cell, i -> |
---|
391 | switch( cell?.toLowerCase() ) { |
---|
392 | case "gscfsample": |
---|
393 | case "sample": |
---|
394 | columns[ "GSCFSample" ] = i; break; |
---|
395 | case "file": |
---|
396 | case "filename": |
---|
397 | columns[ "filename" ] = i; break; |
---|
398 | case "mothursample": |
---|
399 | columns[ "MothurSample" ] = i; break; |
---|
400 | } |
---|
401 | } |
---|
402 | |
---|
403 | // Loop through the data and find matches |
---|
404 | data.each { row -> |
---|
405 | def filename = columns[ "filename" ] != null ? row[ columns[ "filename" ] ] : null; |
---|
406 | def GSCFSample = columns[ "GSCFSample" ] != null ? row[ columns[ "GSCFSample" ] ] : null; |
---|
407 | def mothurSample = columns[ "MothurSample" ] != null ? row[ columns[ "MothurSample" ] ] : null; |
---|
408 | |
---|
409 | // Matches only exist if GSCF sample is entered in this row |
---|
410 | if( GSCFSample && GSCFSample != "null" ) { |
---|
411 | if( filename && filename != "null" ) { |
---|
412 | // Remove extension from the filename, but only if it is |
---|
413 | // .fna, .fasta, .qual, .fqa, .xls or .xlsx. Otherwise useful parts of the filename would be removed. |
---|
414 | def basename = ( filename =~ /\.(fna|fasta|qual|fqa|xls|xlsx)$/ ).replaceAll( "" ); |
---|
415 | |
---|
416 | matches[ "filenames" ] << [ filename: filename, samplename: GSCFSample, basename: basename ]; |
---|
417 | } |
---|
418 | |
---|
419 | if( mothurSample && mothurSample != "null" ) { |
---|
420 | matches[ "mothursamples" ] << [ samplename: GSCFSample, mothurname: mothurSample ]; |
---|
421 | } |
---|
422 | } |
---|
423 | } |
---|
424 | |
---|
425 | parsedFile[ 'columns' ] = columns; |
---|
426 | parsedFile[ 'matches' ] = matches; |
---|
427 | } |
---|
428 | } |
---|
429 | |
---|
430 | return parsedFiles |
---|
431 | } |
---|
432 | |
---|
433 | /** |
---|
434 | * Parses the given line and updates the average quality based on the scores |
---|
435 | * @param quality [quality: 0.0f, number: 0L] |
---|
436 | * @param line String of integer quality scores, separated by a whitespace |
---|
437 | * @return [quality: 0.0f, number: 0L] |
---|
438 | */ |
---|
439 | protected def updateQuality( def quality, String line ) { |
---|
440 | // Determine current average |
---|
441 | List tokens = line.tokenize(); |
---|
442 | Long total = 0; |
---|
443 | |
---|
444 | tokens.each { |
---|
445 | total += Integer.parseInt( it ); |
---|
446 | } |
---|
447 | |
---|
448 | int numTokens = tokens.size(); |
---|
449 | |
---|
450 | // Update the given average |
---|
451 | if( numTokens > 0 ) { |
---|
452 | quality.number += numTokens; |
---|
453 | quality.quality = quality.quality + ( ( total / numTokens as double ) - quality.quality ) / quality.number * numTokens; |
---|
454 | } |
---|
455 | |
---|
456 | return quality |
---|
457 | } |
---|
458 | |
---|
459 | /** |
---|
460 | * Moves a fasta and qual file to their permanent location, and returns information about these files |
---|
461 | * |
---|
462 | * @param fastaFile Filename of the fasta file in the temporary directory |
---|
463 | * @param qualFile Filename of the fasta file in the temporary directory |
---|
464 | * @param processedFiles Structure with data about uploaded files and matches |
---|
465 | * @return [ fasta: <fasta filename>, qual: <qual filename>, numSequences: <number of sequences>, avgQuality: <average quality> ] |
---|
466 | */ |
---|
467 | public def savePermanent( String fastaFile, String qualFile, String logFile, def processedFiles ) { |
---|
468 | File permanentDirectory = fileService.absolutePath( ConfigurationHolder.config.massSequencing.fileDir ); |
---|
469 | def returnStructure = [:]; |
---|
470 | |
---|
471 | if( fileService.fileExists( fastaFile ) ) { |
---|
472 | // Lookup the original filename |
---|
473 | def fastaData = processedFiles?.parsed?.success?.find { it.filename == fastaFile }; |
---|
474 | if( fastaData ) { |
---|
475 | returnStructure.fasta = fileService.moveFileToUploadDir( fileService.get( fastaFile ), fastaData.originalfilename, permanentDirectory ); |
---|
476 | returnStructure.numSequences = fastaData.numSequences; |
---|
477 | } else { |
---|
478 | throw new Exception( "Fasta file wasn't uploaded the right way. Maybe the session has expired/" ); |
---|
479 | } |
---|
480 | } else { |
---|
481 | // If the file doesn't exist, we can't save anything to the database. |
---|
482 | throw new Exception( "Fasta file to save doesn't exist on disk" ); |
---|
483 | } |
---|
484 | |
---|
485 | if( qualFile && fileService.fileExists( qualFile ) ) { |
---|
486 | // Lookup the original filename |
---|
487 | def qualData = processedFiles.parsed.success.find { it.filename == qualFile }; |
---|
488 | |
---|
489 | if( qualData ) { |
---|
490 | returnStructure.qual = fileService.moveFileToUploadDir( fileService.get(qualFile), qualData.originalfilename, permanentDirectory ); |
---|
491 | returnStructure.avgQuality = qualData.avgQuality |
---|
492 | } else { |
---|
493 | // Delete the uploaded fasta file, since this is a serious error |
---|
494 | fileService.delete( returnStructure.fasta, permanentDirectory ); |
---|
495 | throw new Exception( "Qual file wasn't uploaded the right way. Maybe the session has expired" ); |
---|
496 | } |
---|
497 | } else { |
---|
498 | // If the file doesn't exist, we don't save any quality information |
---|
499 | returnStructure.qual = null |
---|
500 | returnStructure.avgQuality = 0; |
---|
501 | } |
---|
502 | |
---|
503 | if( logFile && fileService.fileExists( logFile ) ) { |
---|
504 | // Lookup the original filename |
---|
505 | def logData = processedFiles.parsed.success.find { it.filename == logFile }; |
---|
506 | |
---|
507 | if( logData ) { |
---|
508 | returnStructure.logfile = fileService.moveFileToUploadDir( fileService.get(logFile), logData.originalfilename, permanentDirectory, true ); |
---|
509 | } else { |
---|
510 | // Delete the uploaded fasta file, since this is a serious error |
---|
511 | fileService.delete( returnStructure.fasta, permanentDirectory ); |
---|
512 | |
---|
513 | if( returnStructure.qual ) |
---|
514 | fileService.delete( returnStructure.qual, permanentDirectory ); |
---|
515 | |
---|
516 | throw new Exception( "Log file wasn't uploaded the right way. Maybe the session has expired" ); |
---|
517 | } |
---|
518 | } else { |
---|
519 | // If the file doesn't exist, we don't save any log file |
---|
520 | returnStructure.logfile = null |
---|
521 | } |
---|
522 | |
---|
523 | return returnStructure; |
---|
524 | } |
---|
525 | |
---|
526 | /* |
---|
527 | * Creates a histogram of sequence lengths for the given assaysamples |
---|
528 | * @param assaySamples List of assaySamples to create the histogram for |
---|
529 | * @return List List with length-number tuples. Example: |
---|
530 | * [ [400, 12], [402, 9] ] |
---|
531 | */ |
---|
532 | public List sequenceLengthHistogram( ArrayList assaySamples ) { |
---|
533 | if( !assaySamples || assaySamples.size() == 0 ) |
---|
534 | return []; |
---|
535 | |
---|
536 | // Determine the directory the uploaded files are stored in |
---|
537 | File permanentDirectory = fileService.absolutePath( ConfigurationHolder.config.massSequencing.fileDir ); |
---|
538 | |
---|
539 | // Initialize the histogram |
---|
540 | def histogram = [:]; |
---|
541 | |
---|
542 | // Loop through all samples and find the histogram values for each of them |
---|
543 | assaySamples.each { assaySample -> |
---|
544 | if( assaySample.numSequences() > 0 ) { |
---|
545 | assaySample.sequenceData.each { sequenceData -> |
---|
546 | File sequenceFile = fileService.get( sequenceData.sequenceFile, permanentDirectory ); |
---|
547 | |
---|
548 | parseFastaLengths( sequenceFile ).each { k, v -> |
---|
549 | histogram[ k ] = ( histogram[ k ] ?: 0 ) + v; |
---|
550 | }; |
---|
551 | } |
---|
552 | } |
---|
553 | } |
---|
554 | |
---|
555 | def lengthList = []; |
---|
556 | histogram.each { |
---|
557 | if( it.key != 'unknown' ) |
---|
558 | lengthList << [ it.key, it.value ] |
---|
559 | } |
---|
560 | |
---|
561 | return lengthList; |
---|
562 | } |
---|
563 | |
---|
564 | /** |
---|
565 | * Start the process of exporting fasta data in the background |
---|
566 | * |
---|
567 | * @param assaySamples List of assaysamples to be exported |
---|
568 | * @param httpSession Reference to the HTTP session |
---|
569 | * @param name Name of the exported filename |
---|
570 | * @return URL to redirect the user to, that shows information about the progress of the download |
---|
571 | */ |
---|
572 | public String startExportProcess( def assaySamples, def httpSession, String name, String returnUrl, String finishUrl ) { |
---|
573 | // Determine the total filesize to be processed. |
---|
574 | def permanentDir = fileService.absolutePath( ConfigurationHolder.config.massSequencing.fileDir ); |
---|
575 | def filesize = 0 |
---|
576 | assaySamples.each { assaySample -> |
---|
577 | assaySample.sequenceData.each { |
---|
578 | filesize += fileService.get( it.sequenceFile, permanentDir )?.size() ?: 0; |
---|
579 | if( it.qualityFile ) |
---|
580 | filesize += fileService.get( it.qualityFile, permanentDir )?.size() ?: 0; |
---|
581 | } |
---|
582 | }; |
---|
583 | |
---|
584 | // After the files are concatenated, the system has to retrieve data from GSCF. However, |
---|
585 | // we don't know how long that takes, but is it proportional to the number of samples |
---|
586 | // For that reason, we append a number to the total size, so the progress bar will keep |
---|
587 | // some space open while fetching GSCF data |
---|
588 | filesize += assaySamples.size() * AMOUNT_PER_ASSAYSAMPLE; |
---|
589 | |
---|
590 | // Empty the assaySample list, since the objects should be retrieved from the database again by the |
---|
591 | // initiated thread. Otherwise, hibernate session errors occur, since the other thread has a different |
---|
592 | // hibernate session |
---|
593 | def assaySampleIds = assaySamples.collect { it.id } |
---|
594 | assaySamples = null; |
---|
595 | |
---|
596 | // Create a worker screen to create the fasta file in the background |
---|
597 | def processId = workerService.initProcess( httpSession, "Creating your download", 1, filesize ) |
---|
598 | httpSession.process[ processId ][ "returnUrl" ] = returnUrl |
---|
599 | httpSession.process[ processId ].name = name; |
---|
600 | |
---|
601 | // Retrieve worker URL; returnUrl is the same as the errorUrl |
---|
602 | finishUrl = sprintf( finishUrl, processId ); |
---|
603 | def url = workerService.startProcess( httpSession, processId, finishUrl, returnUrl, returnUrl ) |
---|
604 | |
---|
605 | // Make sure the background process can send the progress to the HTTP session. |
---|
606 | def onProgress = { progress -> |
---|
607 | // Update progress |
---|
608 | httpSession.progress[ processId ].stepProgress = progress; |
---|
609 | } |
---|
610 | |
---|
611 | // Perform the real work in a background thread |
---|
612 | runAsync { |
---|
613 | def samples = assaySampleIds.collect { AssaySample.get( it ) } |
---|
614 | |
---|
615 | def filename = fileService.getUniqueFilename( "fastadownload" ); |
---|
616 | def file = fileService.get( filename ); |
---|
617 | |
---|
618 | try { |
---|
619 | export( samples, new FileOutputStream( file ), httpSession.sessionToken, onProgress ); |
---|
620 | } catch( Exception e ) { |
---|
621 | log.error( "Exception occurred during export of sequences: " ); |
---|
622 | e.printStackTrace(); |
---|
623 | |
---|
624 | httpSession.progress[ processId ].error = true; |
---|
625 | httpSession.progress[ processId ].errorMessage += e.getMessage(); |
---|
626 | } |
---|
627 | |
---|
628 | // Store the filename for later download |
---|
629 | httpSession.process[ processId ].filename = filename; |
---|
630 | |
---|
631 | // Tell the frontend we are finished |
---|
632 | httpSession.progress[ processId ].finished = true; |
---|
633 | } |
---|
634 | |
---|
635 | return url; |
---|
636 | } |
---|
637 | |
---|
638 | /** |
---|
639 | * Exports the fasta data of a list of assaysamples |
---|
640 | * @param assaySamples Assaysamples to export |
---|
641 | * @param outStream Outputstream to send the data to |
---|
642 | * @return |
---|
643 | */ |
---|
644 | public def export( List assaySamples, OutputStream outStream, def sessionToken, Closure onProgress, String name = null ) { |
---|
645 | if( !assaySamples || assaySamples.size() == 0 ) |
---|
646 | return false; |
---|
647 | |
---|
648 | // Retrieve the filename from configuration, if none is given |
---|
649 | if( !name ) |
---|
650 | name = ConfigurationHolder.config.massSequencing.exportFilename |
---|
651 | |
---|
652 | // Determine the directory the uploaded files are stored in |
---|
653 | File permanentDirectory = fileService.absolutePath( ConfigurationHolder.config.massSequencing.fileDir ); |
---|
654 | |
---|
655 | // First check whether qual files should be exported or not |
---|
656 | // It is only exported if qual scores are available for all sequences |
---|
657 | def exportQual = ( assaySamples*.numSequences().sum() == assaySamples*.numQualScores().sum() ); |
---|
658 | |
---|
659 | // First create tags for every sample |
---|
660 | def tags = []; |
---|
661 | |
---|
662 | // Determine new tag length. Since we can use 4 characters per |
---|
663 | // tag position, we only have to use 4log( #samples) |
---|
664 | // The minimum number of characters used is 10, to ensure the correct working of the other |
---|
665 | // programs. |
---|
666 | int tagLength = computeTagLength( assaySamples.size() ) |
---|
667 | int tagNumber = 0; |
---|
668 | |
---|
669 | assaySamples.each { assaySample -> |
---|
670 | if( assaySample.numSequences() > 0 ) { |
---|
671 | // Create a new tag for this assaysample |
---|
672 | def tag = createTag( tagLength, tagNumber++); |
---|
673 | |
---|
674 | // Save the tag for exporting |
---|
675 | tags << [ assaySampleId: assaySample.id, sampleName: assaySample.sample.name, |
---|
676 | assayName: assaySample.assay.name, studyName: assaySample.assay.study.name, |
---|
677 | forwardPrimer: assaySample.fwPrimerSeq, reversePrimer: assaySample.revPrimerSeq, |
---|
678 | tag: tag |
---|
679 | ]; |
---|
680 | } |
---|
681 | } |
---|
682 | |
---|
683 | // Now create zip file for fasta and qual files |
---|
684 | ZipOutputStream zipFile = new ZipOutputStream( new BufferedOutputStream( outStream ) ); |
---|
685 | BufferedWriter zipWriter = new BufferedWriter( new OutputStreamWriter( zipFile ) ); |
---|
686 | |
---|
687 | // Initialize the progress. This is the combined filesize of the sequence and quality files |
---|
688 | def progress = 0; |
---|
689 | |
---|
690 | // Create a reference to the assaySample we're processing here, in order to have it set if |
---|
691 | // an error occurs |
---|
692 | def assaySample |
---|
693 | |
---|
694 | // We have to loop twice through the sequenceData, since we can't write part of the sequence |
---|
695 | // file and part of the qual files mixed. We have to write the full sequence file first. |
---|
696 | try { |
---|
697 | zipFile.putNextEntry( new ZipEntry( name + ".fna" ) ); |
---|
698 | |
---|
699 | for( def i = 0; i < assaySamples.size(); i++ ) { |
---|
700 | assaySample = assaySamples[ i ]; |
---|
701 | |
---|
702 | if( assaySample.numSequences() > 0 ) { |
---|
703 | def currentTag = tags.find { it.assaySampleId == assaySample.id }; |
---|
704 | |
---|
705 | assaySample.sequenceData.each { sequenceData -> |
---|
706 | if( sequenceData && sequenceData.sequenceFile ) { |
---|
707 | def file = fileService.get( sequenceData.sequenceFile, permanentDirectory ); |
---|
708 | |
---|
709 | def begin = System.currentTimeMillis(); |
---|
710 | |
---|
711 | copyFastaFileForExport( file, currentTag.tag, zipWriter) |
---|
712 | |
---|
713 | log.trace "Exported FASTA: " + file + ": " + ( file.size() / 1000 ) + "kb / " + (System.currentTimeMillis() - begin ) + " ms"; |
---|
714 | |
---|
715 | // Update progress |
---|
716 | progress += file.size(); |
---|
717 | onProgress( progress ); |
---|
718 | } |
---|
719 | } |
---|
720 | } |
---|
721 | } |
---|
722 | zipWriter.flush(); |
---|
723 | zipFile.closeEntry(); |
---|
724 | |
---|
725 | if( exportQual ) { |
---|
726 | zipFile.putNextEntry( new ZipEntry( name + ".qual" ) ); |
---|
727 | |
---|
728 | for( def i = 0; i < assaySamples.size(); i++ ) { |
---|
729 | assaySample = assaySamples[ i ]; |
---|
730 | |
---|
731 | if( assaySample.numSequences() > 0 ) { |
---|
732 | def currentTag = tags.find { it.assaySampleId == assaySample.id }; |
---|
733 | |
---|
734 | assaySample.sequenceData.each { sequenceData -> |
---|
735 | if( sequenceData && sequenceData.sequenceFile && sequenceData.qualityFile ) { |
---|
736 | def file = fileService.get( sequenceData.qualityFile, permanentDirectory ); |
---|
737 | |
---|
738 | def begin = System.currentTimeMillis(); |
---|
739 | |
---|
740 | copyQualFileForExport( fileService.get( sequenceData.qualityFile, permanentDirectory ), currentTag.tag, zipWriter) |
---|
741 | |
---|
742 | log.trace "Exported QUAL: " + file + ": " + Math.round( file.size() / 1000 ) + "kb / " + (System.currentTimeMillis() - begin ) + " ms"; |
---|
743 | |
---|
744 | // Update progress |
---|
745 | progress += file.size(); |
---|
746 | onProgress( progress ); |
---|
747 | } |
---|
748 | } |
---|
749 | } |
---|
750 | } |
---|
751 | |
---|
752 | zipWriter.flush(); |
---|
753 | zipFile.closeEntry(); |
---|
754 | } else { |
---|
755 | // Update progress with the filesize of all qual files |
---|
756 | for( def i = 0; i < assaySamples.size(); i++ ) { |
---|
757 | assaySample = assaySamples[ i ]; |
---|
758 | |
---|
759 | if( assaySample.numSequences() > 0 ) { |
---|
760 | assaySample.sequenceData.each { sequenceData -> |
---|
761 | if( sequenceData && sequenceData.sequenceFile && sequenceData.qualityFile ) { |
---|
762 | def file = fileService.get( sequenceData.qualityFile, permanentDirectory ); |
---|
763 | |
---|
764 | // Update progress |
---|
765 | progress += file.size(); |
---|
766 | onProgress( progress ); |
---|
767 | } |
---|
768 | } |
---|
769 | } |
---|
770 | } |
---|
771 | } |
---|
772 | |
---|
773 | } catch( Exception e ) { |
---|
774 | log.error "Error while writing to fastafile or qualfile: " + e.getMessage(); |
---|
775 | |
---|
776 | // We shouldn't continue if anything went wrong |
---|
777 | throw new Exception( "Error while exporting data for sample " + assaySample.sample.name + ": " + e.getCause().getMessage() ); |
---|
778 | } finally { |
---|
779 | // Always close zip entry |
---|
780 | try { |
---|
781 | zipFile.closeEntry(); |
---|
782 | } catch( Exception e ) { |
---|
783 | log.error "Error while closing zip entry for fasta and qual: " + e.getMessage(); |
---|
784 | } |
---|
785 | } |
---|
786 | |
---|
787 | // Export a tab delimited file with tags |
---|
788 | zipFile.putNextEntry( new ZipEntry( name + ".tab" ) ); |
---|
789 | exportTabDelimitedSampleTagFile( tags, zipWriter ); |
---|
790 | zipWriter.flush(); |
---|
791 | zipFile.closeEntry(); |
---|
792 | |
---|
793 | // Export a mothur file with tags |
---|
794 | zipFile.putNextEntry( new ZipEntry( name + ".oligos" ) ); |
---|
795 | exportMothurSampleTagFile( tags, zipWriter ); |
---|
796 | zipWriter.flush(); |
---|
797 | zipFile.closeEntry(); |
---|
798 | |
---|
799 | // Export an excel file with information about the samples |
---|
800 | zipFile.putNextEntry( new ZipEntry( name + ".xls" ) ); |
---|
801 | sampleExcelService.sessionToken = sessionToken |
---|
802 | sampleExcelService.exportExcelSampleData( assaySamples, tags, zipFile ); |
---|
803 | zipFile.flush(); |
---|
804 | zipFile.closeEntry(); |
---|
805 | |
---|
806 | // After the files are concatenated, the system has to retrieve data from GSCF. However, |
---|
807 | // we don't know how long that takes, but is it proportional to the number of samples |
---|
808 | // For that reason, we append a number to the total size, so the progress bar will keep |
---|
809 | // some space open while fetching GSCF data |
---|
810 | onProgress( progress + assaySamples.size() * AMOUNT_PER_ASSAYSAMPLE ); |
---|
811 | |
---|
812 | // Export an excel file with information about the classification samples |
---|
813 | zipFile.putNextEntry( new ZipEntry( name + "classification.summary" ) ); |
---|
814 | classificationService.exportClassifications( assaySamples, zipFile ); |
---|
815 | zipFile.flush(); |
---|
816 | zipFile.closeEntry(); |
---|
817 | |
---|
818 | zipFile.close(); |
---|
819 | } |
---|
820 | |
---|
821 | /** |
---|
822 | * Creates an oligos file for Mothur that represents the connection between samples |
---|
823 | * and the artificial tags. |
---|
824 | * |
---|
825 | * @see http://www.mothur.org/wiki/Trim.seqs#allfiles |
---|
826 | * @param tags Map with newly created tags |
---|
827 | * @param zipWriter Writer to write the data to |
---|
828 | */ |
---|
829 | protected void exportMothurSampleTagFile( List tags, Writer zipWriter ) { |
---|
830 | // Add the forward and reverse primers, as found in the assaysamples |
---|
831 | // The primers are already cut off, so they are not relevant anymore |
---|
832 | // For that reason, a '#' is prepended to each line. |
---|
833 | def fwPrimers = tags.collect { it.forwardPrimer }.findAll { it }.unique(); |
---|
834 | def revPrimers = tags.collect { it.reversePrimer }.findAll { it }.unique(); |
---|
835 | |
---|
836 | fwPrimers.each { zipWriter.write( "#forward\t" + it + "\n" ) } |
---|
837 | revPrimers.each { zipWriter.write( "#reverse\t" + it + "\n" ) } |
---|
838 | |
---|
839 | // Check whether the sample names are unique. If they aren't, the assay and study names |
---|
840 | // are appended to the sample name |
---|
841 | def sampleNames = tags*.sampleNames; |
---|
842 | if( sampleNames.unique().size() < sampleNames.size() ) { |
---|
843 | tags.each { |
---|
844 | it.uniqueName = it.sampleName + " (" + it.assayName + " / " + it.studyName + ")"; |
---|
845 | } |
---|
846 | } else { |
---|
847 | tags.each { |
---|
848 | it.uniqueName = it.sampleName; |
---|
849 | } |
---|
850 | } |
---|
851 | |
---|
852 | tags.each { |
---|
853 | // Samples names are not allowed to contain spaces |
---|
854 | def name = it.uniqueName.replaceAll( /\s+/, '-' ); |
---|
855 | zipWriter.write( "barcode\t" + it.tag + "\t" + name + "\n" ); |
---|
856 | } |
---|
857 | } |
---|
858 | |
---|
859 | /** |
---|
860 | * Creates a tab delimited file with two columns and column headers "Sequence" and "Samplename" |
---|
861 | * @param tags Map with newly created tags |
---|
862 | * @param zipWriter Writer to write the data to |
---|
863 | */ |
---|
864 | protected void exportTabDelimitedSampleTagFile( List tags, Writer zipWriter ) { |
---|
865 | zipWriter.write( "Sequence" + "\t" + "Samplename" + "\n" ); |
---|
866 | |
---|
867 | // Check whether the sample names are unique. If they aren't, the assay and study names |
---|
868 | // are appended to the sample name |
---|
869 | def sampleNames = tags*.sampleNames; |
---|
870 | if( sampleNames.unique().size() < sampleNames.size() ) { |
---|
871 | tags.each { |
---|
872 | it.uniqueName = it.sampleName + " (" + it.assayName + " / " + it.studyName + ")"; |
---|
873 | } |
---|
874 | } else { |
---|
875 | tags.each { |
---|
876 | it.uniqueName = it.sampleName; |
---|
877 | } |
---|
878 | } |
---|
879 | |
---|
880 | tags.each { |
---|
881 | def name = it.uniqueName.replaceAll( /\s+/, '-' ); |
---|
882 | zipWriter.write( it.tag + "\t" + name + "\n" ); |
---|
883 | } |
---|
884 | } |
---|
885 | |
---|
886 | /** |
---|
887 | * Deletes all sequences from a list of assaysamples |
---|
888 | * @param assaySample |
---|
889 | * @return |
---|
890 | */ |
---|
891 | public int deleteSequenceData( ArrayList assaySamples ) { |
---|
892 | def numFiles = 0; |
---|
893 | assaySamples?.each { assaySample -> |
---|
894 | numFiles += assaySample.deleteSequenceData(); |
---|
895 | } |
---|
896 | |
---|
897 | // Update classification summary |
---|
898 | classificationService.updateClassificationForAssaySamples( assaySamples ); |
---|
899 | |
---|
900 | return numFiles; |
---|
901 | } |
---|
902 | |
---|
903 | /** |
---|
904 | * Creates a unique tag for the given number |
---|
905 | * @param length |
---|
906 | * @param tagNumber |
---|
907 | * @return |
---|
908 | */ |
---|
909 | public String createTag( int length, int tagNumber ) { |
---|
910 | def chars = ["C", "A", "G", "T"]; |
---|
911 | def numChars = chars.size(); |
---|
912 | |
---|
913 | // Determine whether the tagNumber is not too high |
---|
914 | if( tagNumber > numChars * ( ( numChars - 1 ) ** ( length - 1 ) ) ) |
---|
915 | throw new Exception( "Given tag number (" + tagNumber + ") is too large for the specified length (" + length + ")") |
---|
916 | |
---|
917 | String tag = ""; |
---|
918 | |
---|
919 | // Loop through all characters |
---|
920 | for( def i = 0; i < length; i++ ) { |
---|
921 | def currentChars |
---|
922 | |
---|
923 | // Make sure the characters to choose from are different from |
---|
924 | // the previous character |
---|
925 | if( i == 0 ) |
---|
926 | currentChars = chars; |
---|
927 | else |
---|
928 | currentChars = chars - tag[ 0 ] |
---|
929 | |
---|
930 | // Determine which character to append by taking the tagNumber modulo the number of characters |
---|
931 | int currentChar = tagNumber % currentChars.size() |
---|
932 | tag = currentChars[ currentChar ] + tag; |
---|
933 | |
---|
934 | tagNumber = Math.floor( tagNumber / currentChars.size() ); |
---|
935 | } |
---|
936 | |
---|
937 | return tag; |
---|
938 | } |
---|
939 | |
---|
940 | /** |
---|
941 | * Computes the number of characters needed for a tag to be unique for the given number of samples |
---|
942 | * @param numSamples Number of samples to generate tags for |
---|
943 | * @return Number of characters needed for a unique tag |
---|
944 | */ |
---|
945 | public int computeTagLength( int numSamples ) { |
---|
946 | // No homopolymers are allowed in the tag. For that reason we have |
---|
947 | // 4 possibilities for the first character, and 3 for every next character |
---|
948 | |
---|
949 | // In order to allow enough characters, we take the 3log(numSamples), with a |
---|
950 | // minimum of 10 |
---|
951 | return Math.max( 10, Math.ceil( Math.log( numSamples ) / Math.log( 3 ) ) ); |
---|
952 | } |
---|
953 | |
---|
954 | /** |
---|
955 | * Copies the contents of the given sequence file to the output file and prepends the tag to every sequences |
---|
956 | * @param inFile Filename of the file to be read |
---|
957 | * @param tag |
---|
958 | * @param outWriter |
---|
959 | * @return |
---|
960 | */ |
---|
961 | protected boolean copyFastaFileForExport( File inFile, String tag, BufferedWriter outWriter ) { |
---|
962 | // Walk through the lines in the file, starting with '>' |
---|
963 | // (and where the following line contains a character other than '>') |
---|
964 | |
---|
965 | try { |
---|
966 | BufferedReader inReader = new BufferedReader( new FileReader( inFile ) ); |
---|
967 | |
---|
968 | String line = null |
---|
969 | String newLine = "" |
---|
970 | String sequence = ""; |
---|
971 | |
---|
972 | def lengthPattern = ~/length=(\d+)/ |
---|
973 | def lengthMatches |
---|
974 | int length = 0; |
---|
975 | int tagLength = tag.size(); |
---|
976 | |
---|
977 | while( ( line = inReader.readLine()) != null) { |
---|
978 | if( line.size() == 0 ) { |
---|
979 | // Print the sequence we collected, before writing the empty line |
---|
980 | printSequence( outWriter, sequence, tag ); |
---|
981 | sequence = ""; |
---|
982 | |
---|
983 | // Empty line |
---|
984 | outWriter.newLine(); |
---|
985 | } else if( line[ 0 ] == '>' ) { |
---|
986 | // Print the sequence we collected, before writing the new comments tag |
---|
987 | printSequence( outWriter, sequence, tag ); |
---|
988 | sequence = ""; |
---|
989 | |
---|
990 | // Comments line: replace length=### with the |
---|
991 | // updated length, and put the line in the |
---|
992 | lengthMatches = ( line =~ lengthPattern ); |
---|
993 | if( lengthMatches ) { |
---|
994 | length = Integer.valueOf( lengthMatches[0][1] ) + tagLength; |
---|
995 | newLine = lengthMatches.replaceAll( "length=" + length ); |
---|
996 | } else { |
---|
997 | newLine = line; |
---|
998 | } |
---|
999 | |
---|
1000 | outWriter.write(newLine); |
---|
1001 | outWriter.newLine(); |
---|
1002 | } else { |
---|
1003 | // This is part of the sequence. We collect the whole sequence and |
---|
1004 | // determine in the end how to write it to the file |
---|
1005 | sequence += line; |
---|
1006 | } |
---|
1007 | } |
---|
1008 | |
---|
1009 | // Print the sequence we collected, before ending the file |
---|
1010 | printSequence( outWriter, sequence, tag ); |
---|
1011 | sequence = ""; |
---|
1012 | |
---|
1013 | } catch( Exception e ) { |
---|
1014 | log.error( "An error occurred while copying contents from " + inFile.getName() + ": " + e.getMessage() ); |
---|
1015 | |
---|
1016 | // Throw the exception, since the calling method should decide whether to continue or not |
---|
1017 | throw new Exception( "An error occurred while copying contents from FASTA file " + inFile.getName() + ": " + e.getMessage(), e ); |
---|
1018 | } |
---|
1019 | } |
---|
1020 | |
---|
1021 | /** |
---|
1022 | * Prints a sequence to the output file |
---|
1023 | * @param outWriter |
---|
1024 | * @param sequence |
---|
1025 | * @param tag |
---|
1026 | */ |
---|
1027 | private void printSequence( BufferedWriter outWriter, String sequence, String tag, int maxWidth = 60 ) { |
---|
1028 | // If no sequence is given, also don't prepend it with the tag |
---|
1029 | if( sequence.size() == 0 ) |
---|
1030 | return |
---|
1031 | |
---|
1032 | // Prepend the tag to the sequence |
---|
1033 | sequence = tag + sequence; |
---|
1034 | |
---|
1035 | // Write the sequence with a width of maxWidth characters per line |
---|
1036 | while( sequence ) { |
---|
1037 | if( sequence.size() > maxWidth ) { |
---|
1038 | outWriter.write( sequence[0..maxWidth-1] ); |
---|
1039 | sequence = sequence[maxWidth..-1] |
---|
1040 | } else { |
---|
1041 | outWriter.write( sequence ); |
---|
1042 | sequence = null; |
---|
1043 | } |
---|
1044 | outWriter.newLine(); |
---|
1045 | } |
---|
1046 | } |
---|
1047 | |
---|
1048 | /** |
---|
1049 | * Copies the contents of the given qual file to the output file and prepends the tag quality score to every sequence. |
---|
1050 | * For every tag character '40' is prepended to the qual scores |
---|
1051 | * |
---|
1052 | * @param inFile Filename of the file to be read |
---|
1053 | * @param tag |
---|
1054 | * @param outWriter |
---|
1055 | * @return |
---|
1056 | */ |
---|
1057 | protected boolean copyQualFileForExport( File inFile, String tag, BufferedWriter outWriter ) { |
---|
1058 | // Walk through the lines in the file, starting with '>' |
---|
1059 | // (and where the following line contains a character other than '>') |
---|
1060 | try { |
---|
1061 | BufferedReader inReader = new BufferedReader( new FileReader( inFile ) ); |
---|
1062 | |
---|
1063 | String line = null |
---|
1064 | String newLine = null |
---|
1065 | List<Integer> qualScores = [] |
---|
1066 | |
---|
1067 | def lengthPattern = ~/length=(\d+)/ |
---|
1068 | def lengthMatches |
---|
1069 | int length = 0; |
---|
1070 | int tagLength = tag.size(); |
---|
1071 | |
---|
1072 | while( ( line = inReader.readLine()) != null) { |
---|
1073 | if( line.size() == 0 ) { |
---|
1074 | // Print the quality scores we collected, before writing the empty line |
---|
1075 | printQualScore( outWriter, qualScores, tagLength ); |
---|
1076 | qualScores = []; |
---|
1077 | |
---|
1078 | // Empty line |
---|
1079 | outWriter.newLine(); |
---|
1080 | } else if( line[ 0 ] == '>' ) { |
---|
1081 | // Print the quality scores we collected, before writing the empty line |
---|
1082 | printQualScore( outWriter, qualScores, tagLength ); |
---|
1083 | qualScores = []; |
---|
1084 | |
---|
1085 | // Comments line: replace length=### with the |
---|
1086 | // updated length, and put the line in the |
---|
1087 | lengthMatches = ( line =~ lengthPattern ); |
---|
1088 | if( lengthMatches ) { |
---|
1089 | length = Integer.valueOf( lengthMatches[0][1] ) + tagLength; |
---|
1090 | newLine = lengthMatches.replaceAll( "length=" + length ); |
---|
1091 | } else { |
---|
1092 | newLine = line |
---|
1093 | } |
---|
1094 | |
---|
1095 | outWriter.write(newLine); |
---|
1096 | outWriter.newLine(); |
---|
1097 | } else { |
---|
1098 | // This is part of the quality score. We collect the whole set of quality |
---|
1099 | // scores and determine in the end how to write it to the file |
---|
1100 | qualScores += line.split( " " ).collect { |
---|
1101 | if( !it.isInteger() ) |
---|
1102 | return 0; |
---|
1103 | else |
---|
1104 | return Integer.parseInt( it ); |
---|
1105 | }; |
---|
1106 | } |
---|
1107 | } |
---|
1108 | |
---|
1109 | // Print the quality scores we collected, before ending the file |
---|
1110 | printQualScore( outWriter, qualScores, tagLength ); |
---|
1111 | qualScores = []; |
---|
1112 | |
---|
1113 | } catch( Exception e ) { |
---|
1114 | log.error( "An error occurred while copying contents from " + inFile.getName() + ": " + e.getMessage() ); |
---|
1115 | |
---|
1116 | // Throw the exception, since the calling method should decide whether to continue or not |
---|
1117 | throw new Exception( "An error occurred while copying contents from QUAL file " + inFile.getName() + ": " + e.getMessage(), e ); |
---|
1118 | } |
---|
1119 | } |
---|
1120 | |
---|
1121 | /** |
---|
1122 | * Prints a sequence to the output file |
---|
1123 | * @param outWriter |
---|
1124 | * @param sequence |
---|
1125 | * @param tag |
---|
1126 | */ |
---|
1127 | private void printQualScore( BufferedWriter outWriter, List<Integer> qualScores, int tagLength, int maxWidth = 60 ) { |
---|
1128 | // If no qualScores are given, also don't prepend it with the tag |
---|
1129 | if( qualScores.size() == 0 ) |
---|
1130 | return |
---|
1131 | |
---|
1132 | // Prepend the tag to the sequence |
---|
1133 | qualScores = Collections.nCopies( tagLength, 40 ) + qualScores; |
---|
1134 | |
---|
1135 | // Write the sequence with a width of maxWidth characters per line |
---|
1136 | while( qualScores ) { |
---|
1137 | if( qualScores.size() > maxWidth ) { |
---|
1138 | outWriter.write( qualScores[0..maxWidth-1].join( " " ) ); |
---|
1139 | qualScores = qualScores[maxWidth..-1] |
---|
1140 | } else { |
---|
1141 | outWriter.write( qualScores.join( " " ) ); |
---|
1142 | qualScores = null; |
---|
1143 | } |
---|
1144 | outWriter.newLine(); |
---|
1145 | } |
---|
1146 | } |
---|
1147 | |
---|
1148 | } |
---|