1 | package nl.tno.metagenomics |
---|
2 | |
---|
3 | import java.io.BufferedWriter; |
---|
4 | import java.io.File; |
---|
5 | import java.io.Writer; |
---|
6 | import java.util.ArrayList; |
---|
7 | import java.util.zip.ZipEntry |
---|
8 | import java.util.zip.ZipOutputStream |
---|
9 | import org.codehaus.groovy.grails.commons.ConfigurationHolder |
---|
10 | |
---|
11 | class FastaService { |
---|
12 | def fileService |
---|
13 | def fuzzySearchService |
---|
14 | def sampleExcelService |
---|
15 | |
---|
16 | static transactional = true |
---|
17 | |
---|
18 | /** |
---|
19 | * Parses uploaded files and checks them for FASTA and QUAL files |
---|
20 | * @param filenames List of filenames currently existing in the upload directory |
---|
21 | * @param onProgress Closure to execute when progress indicators should be updated. |
---|
22 | * Has 4 parameters: numFilesProcessed, numBytesProcessed that indicate the number |
---|
23 | * of files and bytes that have been processed in total. totalFiles, totalBytes indicate the change |
---|
24 | * in total number of files and bytes (e.g. should take 1 if a new file is added to the list) |
---|
25 | * @param directory Directory to move the files to |
---|
26 | * @return Structure with information about the parsed files. The 'success' files are |
---|
27 | * moved to the given directory |
---|
28 | * |
---|
29 | * [ |
---|
30 | * success: [ |
---|
31 | * [filename: 'abc.fasta', type: FASTA, numSequences: 190] |
---|
32 | * [filename: 'cde.fasta', type: FASTA, numSequences: 140] |
---|
33 | * [filename: 'abc.qual', type: QUAL, numSequences: 190, avgQuality: 38] |
---|
34 | * [filename: 'cde.qual', type: QUAL, numSequences: 140, avgQuality: 29] |
---|
35 | * ], |
---|
36 | * failure: [ |
---|
37 | * [filename: 'testing.xls', type: 'unknown', message: 'Type not recognized'] |
---|
38 | * ] |
---|
39 | * ] |
---|
40 | * |
---|
41 | */ |
---|
42 | def parseFiles( ArrayList filenames, Closure onProgress, File directory = null ) { |
---|
43 | if( filenames.size() == 0 ) { |
---|
44 | return [ success: [], failure: [] ]; |
---|
45 | } |
---|
46 | |
---|
47 | if( !directory ) { |
---|
48 | directory = fileService.absolutePath( ConfigurationHolder.config.metagenomics.fileDir ) |
---|
49 | } |
---|
50 | |
---|
51 | def success = []; |
---|
52 | def failure = []; |
---|
53 | |
---|
54 | long filesProcessed = 0; |
---|
55 | long bytesProcessed = 0; |
---|
56 | |
---|
57 | // Loop through all filenames |
---|
58 | for( int i = 0; i < filenames.size(); i++ ) { |
---|
59 | def filename = filenames[ i ]; |
---|
60 | |
---|
61 | if( fileService.isZipFile( filename ) ) { |
---|
62 | // ZIP files are extracted and appended to the filenames list. |
---|
63 | def newfiles = fileService.extractZipFile( filename, { files, bytes, totalFiles, totalBytes -> |
---|
64 | filesProcessed += files; |
---|
65 | bytesProcessed += bytes; |
---|
66 | |
---|
67 | onProgress( filesProcessed, bytesProcessed, totalFiles, totalBytes ); |
---|
68 | } ); |
---|
69 | if( newfiles ) { |
---|
70 | newfiles.each { |
---|
71 | filenames.add( it ); |
---|
72 | } |
---|
73 | } |
---|
74 | } else { |
---|
75 | def file = fileService.get( filename ); |
---|
76 | String filetype = fileService.determineFileType( file ); |
---|
77 | |
---|
78 | if( !fileTypeValid( filetype ) ) { |
---|
79 | // If files are not valid for parsing, delete them and return a message to the user |
---|
80 | fileService.delete(filename); |
---|
81 | failure << [ filename: filename, originalfilename: fileService.originalFilename( filename ), type: filetype, message: 'File type not accepted' ]; |
---|
82 | } else { |
---|
83 | try { |
---|
84 | def contents = parseFile( file, filetype, { files, bytes -> |
---|
85 | filesProcessed += files; |
---|
86 | bytesProcessed += bytes; |
---|
87 | |
---|
88 | onProgress( filesProcessed, bytesProcessed, 0, 0 ); |
---|
89 | } ); |
---|
90 | |
---|
91 | contents.filename = file.getName(); |
---|
92 | contents.originalfilename = fileService.originalFilename( contents.filename ) |
---|
93 | |
---|
94 | if( contents.success ) { |
---|
95 | success << contents; |
---|
96 | } else { |
---|
97 | fileService.delete(filename); |
---|
98 | failure << contents; |
---|
99 | } |
---|
100 | } catch( Exception e ) { |
---|
101 | // If anything fails during parsing, return an error message |
---|
102 | fileService.delete(filename); |
---|
103 | failure << [ filename: filename, originalfilename: fileService.originalFilename( filename ), type: filetype, message: e.getMessage() ]; |
---|
104 | } |
---|
105 | } |
---|
106 | } |
---|
107 | } |
---|
108 | |
---|
109 | return [ success: success, failure: failure ]; |
---|
110 | } |
---|
111 | |
---|
112 | /** |
---|
113 | * Matches uploaded fasta and qual files and combines them with the samples they probably belong to |
---|
114 | * @param parsedFiles Parsed files |
---|
115 | * [ |
---|
116 | * [filename: 'abc.fasta', type: FASTA, numSequences: 190] |
---|
117 | * [filename: 'cde.fasta', type: FASTA, numSequences: 140] |
---|
118 | * [filename: 'abc.qual', type: QUAL, numSequences: 190, avgQuality: 38] |
---|
119 | * [filename: 'cde.qual', type: QUAL, numSequences: 140, avgQuality: 29] |
---|
120 | * ] |
---|
121 | * @param samples AssaySample objects to which the files should be matched. |
---|
122 | * @return Structure with information about the matching. |
---|
123 | * [ |
---|
124 | * [ |
---|
125 | * fasta: [filename: 'abc.fasta', type: FASTA, numSequences: 190], |
---|
126 | * qual: [filename: 'abc.qual', type: QUAL, numSequences: 190, avgQuality: 38], |
---|
127 | * feasibleQuals: [ |
---|
128 | * [filename: 'abc.qual', type: QUAL, numSequences: 190, avgQuality: 38], |
---|
129 | * [filename: 'def.qual', type: QUAL, numSequences: 190, avgQuality: 21] |
---|
130 | * ] |
---|
131 | * sample: AssaySample object |
---|
132 | */ |
---|
133 | def matchFiles( def parsedFiles, def samples ) { |
---|
134 | def fastas = parsedFiles.findAll { it.type == "fasta" } |
---|
135 | def quals = parsedFiles.findAll { it.type == "qual" } |
---|
136 | samples = samples.toList() |
---|
137 | |
---|
138 | def files = []; |
---|
139 | |
---|
140 | fastas.each { fastaFile -> |
---|
141 | // Remove extension |
---|
142 | def matchWith = fastaFile.originalfilename.substring( 0, fastaFile.originalfilename.lastIndexOf( '.' ) ) |
---|
143 | |
---|
144 | // Determine feasible quals (based on number of sequences ) |
---|
145 | def feasibleQuals = quals.findAll { it.numSequences == fastaFile.numSequences } |
---|
146 | |
---|
147 | // Best matching qual file |
---|
148 | def qualIdx = fuzzySearchService.mostSimilarWithIndex( matchWith + '.qual', feasibleQuals.originalfilename ); |
---|
149 | |
---|
150 | def qual = null |
---|
151 | if( qualIdx != null ) |
---|
152 | qual = feasibleQuals[ qualIdx ]; |
---|
153 | |
---|
154 | // Best matching sample |
---|
155 | // TODO: Implement method to search for sample matches in a provided excel sheet |
---|
156 | def sampleIdx = fuzzySearchService.mostSimilarWithIndex( matchWith, samples.sample.name ); |
---|
157 | def assaySample = null |
---|
158 | if( sampleIdx != null ) { |
---|
159 | assaySample = samples[ sampleIdx ]; |
---|
160 | } |
---|
161 | |
---|
162 | files << [ |
---|
163 | fasta: fastaFile, |
---|
164 | feasibleQuals: feasibleQuals, |
---|
165 | qual: qual, |
---|
166 | assaySample: assaySample |
---|
167 | ] |
---|
168 | } |
---|
169 | |
---|
170 | return files; |
---|
171 | |
---|
172 | |
---|
173 | } |
---|
174 | |
---|
175 | /** |
---|
176 | * Determines whether a file can be processed. |
---|
177 | * @param filetype Filetype of the file |
---|
178 | * @see determineFileType() |
---|
179 | * @return |
---|
180 | */ |
---|
181 | protected boolean fileTypeValid( String filetype ) { |
---|
182 | switch( filetype ) { |
---|
183 | case "fasta": |
---|
184 | case "qual": |
---|
185 | return true; |
---|
186 | default: |
---|
187 | return false; |
---|
188 | } |
---|
189 | } |
---|
190 | |
---|
191 | /** |
---|
192 | * Parses the given file |
---|
193 | * @param file File to parse |
---|
194 | * @param filetype Type of the given file |
---|
195 | * @param onProgress Closure to execute when progress indicators should be updated. |
---|
196 | * Has 2 parameters: numFilesProcessed and numBytesProcessed that indicate the number |
---|
197 | * of files and bytes that have been processed in this file (so the first parameter should |
---|
198 | * only be 1 when the file is finished) |
---|
199 | * |
---|
200 | * @return List structure. Examples: |
---|
201 | * |
---|
202 | * [ success: true, filename: 'abc.fasta', type: 'fasta', numSequences: 200 ] |
---|
203 | * [ success: true, filename: 'abc.qual', type: 'qual', numSequences: 200, avgQuality: 36 ] |
---|
204 | * [ success: false, filename: 'abc.txt', type: 'txt', message: 'Filetype could not be parsed.' ] |
---|
205 | */ |
---|
206 | protected def parseFile( File file, String filetype, Closure onProgress ) { |
---|
207 | switch( filetype ) { |
---|
208 | case "fasta": |
---|
209 | return parseFasta( file, onProgress ); |
---|
210 | case "qual": |
---|
211 | return parseQual( file, onProgress ); |
---|
212 | default: |
---|
213 | onProgress( 1, file.length() ); |
---|
214 | return [ success: false, type: filetype, message: 'Filetype could not be parsed.' ] |
---|
215 | } |
---|
216 | } |
---|
217 | |
---|
218 | /** |
---|
219 | * Parses the given FASTA file |
---|
220 | * @param file File to parse |
---|
221 | * @param onProgress Closure to execute when progress indicators should be updated. |
---|
222 | * Has 2 parameters: numFilesProcessed and numBytesProcessed that indicate the number |
---|
223 | * of files and bytes that have been processed in this file (so the first parameter should |
---|
224 | * only be 1 when the file is finished) |
---|
225 | * @return List structure. Examples: |
---|
226 | * |
---|
227 | * [ success: true, filename: 'abc.fasta', type: 'fasta', numSequences: 200 ] |
---|
228 | * [ success: false, filename: 'def.fasta', type: 'fasta', message: 'File is not a valid FASTA file' ] |
---|
229 | */ |
---|
230 | protected def parseFasta( File file, Closure onProgress ) { |
---|
231 | |
---|
232 | long startTime = System.nanoTime(); |
---|
233 | log.trace "Start parsing FASTA " + file.getName() |
---|
234 | |
---|
235 | // Count the number of lines, starting with '>' (and where the following line contains a character other than '>') |
---|
236 | long numSequences = 0; |
---|
237 | long bytesProcessed = 0; |
---|
238 | boolean lookingForSequence = false; |
---|
239 | |
---|
240 | file.eachLine { line -> |
---|
241 | if( line ) { |
---|
242 | if( !lookingForSequence && line[0] == '>' ) { |
---|
243 | lookingForSequence = true; |
---|
244 | } else if( lookingForSequence ) { |
---|
245 | if( line[0] != '>' ) { |
---|
246 | numSequences++; |
---|
247 | lookingForSequence = false; |
---|
248 | } |
---|
249 | } |
---|
250 | |
---|
251 | |
---|
252 | // Update progress every time 1MB is processed |
---|
253 | bytesProcessed += line.size(); |
---|
254 | if( bytesProcessed > 1000000 ) { |
---|
255 | onProgress( 0, bytesProcessed ); |
---|
256 | bytesProcessed = 0; |
---|
257 | } |
---|
258 | } |
---|
259 | |
---|
260 | } |
---|
261 | |
---|
262 | // Update progress and say we're finished |
---|
263 | onProgress( 1, bytesProcessed ); |
---|
264 | |
---|
265 | log.trace "Finished parsing FASTA " + file.getName() + ": " + ( System.nanoTime() - startTime ) / 1000000L |
---|
266 | |
---|
267 | return [ success: true, type: "fasta", numSequences: numSequences ]; |
---|
268 | } |
---|
269 | |
---|
270 | /** |
---|
271 | * Parses the given QUAL file |
---|
272 | * @param file File to parse |
---|
273 | * @param onProgress Closure to execute when progress indicators should be updated. |
---|
274 | * Has 2 parameters: numFilesProcessed and numBytesProcessed that indicate the number |
---|
275 | * of files and bytes that have been processed in this file (so the first parameter should |
---|
276 | * only be 1 when the file is finished) |
---|
277 | * @return List structure. Examples: |
---|
278 | * |
---|
279 | * [ success: true, filename: 'abc.qual', type: 'qual', numSequences: 200, avgQuality: 31 ] |
---|
280 | * [ success: false, filename: 'def.qual', type: 'qual', message: 'File is not a valid QUAL file' ] |
---|
281 | */ |
---|
282 | protected def parseQual( File file, Closure onProgress ) { |
---|
283 | long startTime = System.nanoTime(); |
---|
284 | log.trace "Start parsing QUAL " + file.getName() |
---|
285 | |
---|
286 | // Count the number of lines, starting with '>'. After we've found such a character, we continue looking for |
---|
287 | // quality scores |
---|
288 | long numSequences = 0; |
---|
289 | long bytesProcessed = 0; |
---|
290 | def quality = [ quality: 0.0, number: 0L ] |
---|
291 | |
---|
292 | boolean lookingForFirstQualityScores = false; |
---|
293 | file.eachLine { line -> |
---|
294 | if( line ) { |
---|
295 | if( !lookingForFirstQualityScores && line[0] == '>' ) { |
---|
296 | lookingForFirstQualityScores = true; |
---|
297 | } else if( lookingForFirstQualityScores ) { |
---|
298 | if( line[0] != '>' ) { |
---|
299 | numSequences++; |
---|
300 | lookingForFirstQualityScores = false; |
---|
301 | |
---|
302 | // Don't compute average quality because it takes too much time |
---|
303 | //quality = updateQuality( quality, line ); |
---|
304 | } |
---|
305 | } else { |
---|
306 | // Don't compute average quality because it takes too much time |
---|
307 | //quality = updateQuality( quality, line ); |
---|
308 | } |
---|
309 | |
---|
310 | // Update progress every time 1MB is processed |
---|
311 | bytesProcessed += line.size(); |
---|
312 | if( bytesProcessed > 1000000 ) { |
---|
313 | onProgress( 0, bytesProcessed ); |
---|
314 | bytesProcessed = 0; |
---|
315 | } |
---|
316 | } |
---|
317 | |
---|
318 | } |
---|
319 | |
---|
320 | // Update progress and say we're finished |
---|
321 | onProgress( 1, bytesProcessed ); |
---|
322 | |
---|
323 | log.trace "Finished parsing QUAL " + file.getName() + ": " + ( System.nanoTime() - startTime ) / 1000000L |
---|
324 | |
---|
325 | return [ success: true, type: "qual", numSequences: numSequences, avgQuality: quality.quality ]; |
---|
326 | } |
---|
327 | |
---|
328 | /** |
---|
329 | * Parses the given line and updates the average quality based on the scores |
---|
330 | * @param quality [quality: 0.0f, number: 0L] |
---|
331 | * @param line String of integer quality scores, separated by a whitespace |
---|
332 | * @return [quality: 0.0f, number: 0L] |
---|
333 | */ |
---|
334 | protected def updateQuality( def quality, String line ) { |
---|
335 | // Determine current average |
---|
336 | List tokens = line.tokenize(); |
---|
337 | Long total = 0; |
---|
338 | |
---|
339 | tokens.each { |
---|
340 | total += Integer.parseInt( it ); |
---|
341 | } |
---|
342 | |
---|
343 | int numTokens = tokens.size(); |
---|
344 | |
---|
345 | // Update the given average |
---|
346 | if( numTokens > 0 ) { |
---|
347 | quality.number += numTokens; |
---|
348 | quality.quality = quality.quality + ( ( total / numTokens as double ) - quality.quality ) / quality.number * numTokens; |
---|
349 | } |
---|
350 | |
---|
351 | return quality |
---|
352 | } |
---|
353 | |
---|
354 | /** |
---|
355 | * Moves a fasta and qual file to their permanent location, and returns information about these files |
---|
356 | * |
---|
357 | * @param fastaFile Filename of the fasta file in the temporary directory |
---|
358 | * @param qualFile Filename of the fasta file in the temporary directory |
---|
359 | * @param processedFiles Structure with data about uploaded files and matches |
---|
360 | * @return [ fasta: <fasta filename>, qual: <qual filename>, numSequences: <number of sequences>, avgQuality: <average quality> ] |
---|
361 | */ |
---|
362 | public def savePermanent( String fastaFile, String qualFile, def processedFiles ) { |
---|
363 | File permanentDirectory = fileService.absolutePath( ConfigurationHolder.config.metagenomics.fileDir ); |
---|
364 | def returnStructure = [:]; |
---|
365 | |
---|
366 | if( fileService.fileExists( fastaFile ) ) { |
---|
367 | // Lookup the original filename |
---|
368 | def fastaData = processedFiles.parsed.success.find { it.filename == fastaFile }; |
---|
369 | if( fastaData ) { |
---|
370 | returnStructure.fasta = fileService.moveFileToUploadDir( fileService.get( fastaFile ), fastaData.originalfilename, permanentDirectory ); |
---|
371 | returnStructure.numSequences = fastaData.numSequences; |
---|
372 | } else { |
---|
373 | throw new Exception( "Fasta file wasn't uploaded the right way. Maybe the session has expired/" ); |
---|
374 | } |
---|
375 | } else { |
---|
376 | // If the file doesn't exist, we can't save anything to the database. |
---|
377 | throw new Exception( "Fasta file to save doesn't exist on disk" ); |
---|
378 | } |
---|
379 | |
---|
380 | if( qualFile && fileService.fileExists( qualFile ) ) { |
---|
381 | // Lookup the original filename |
---|
382 | def qualData = processedFiles.parsed.success.find { it.filename == qualFile }; |
---|
383 | |
---|
384 | if( qualData ) { |
---|
385 | returnStructure.qual = fileService.moveFileToUploadDir( fileService.get(qualFile ), qualData.originalfilename, permanentDirectory ); |
---|
386 | returnStructure.avgQuality = qualData.avgQuality |
---|
387 | } else { |
---|
388 | // Delete the uploaded fasta file, since this is a serious error |
---|
389 | fileService.delete( returnStructure.fasta, permanentDirectory ); |
---|
390 | throw new Exception( "Qual file wasn't uploaded the right way. Maybe the session has expired" ); |
---|
391 | } |
---|
392 | } else { |
---|
393 | // If the file doesn't exist, we don't save any quality information |
---|
394 | returnStructure.qual = null |
---|
395 | returnStructure.avgQuality = 0; |
---|
396 | } |
---|
397 | |
---|
398 | return returnStructure; |
---|
399 | } |
---|
400 | |
---|
401 | /** |
---|
402 | * Exports the fasta data of a list of assaysamples |
---|
403 | * @param assaySamples Assaysamples to export |
---|
404 | * @param outStream Outputstream to send the data to |
---|
405 | * @return |
---|
406 | */ |
---|
407 | public def export( List assaySamples, OutputStream outStream, String name ) { |
---|
408 | if( !assaySamples || assaySamples.size() == 0 ) |
---|
409 | return false; |
---|
410 | |
---|
411 | // Determine the directory the uploaded files are stored in |
---|
412 | File permanentDirectory = fileService.absolutePath( ConfigurationHolder.config.metagenomics.fileDir ); |
---|
413 | |
---|
414 | // First check whether qual files should be exported or not |
---|
415 | // It is only exported if qual scores are available for all sequences |
---|
416 | def exportQual = ( assaySamples*.numSequences().sum() == assaySamples*.numQualScores().sum() ); |
---|
417 | |
---|
418 | // First create tags for every sample |
---|
419 | def tags = []; |
---|
420 | |
---|
421 | // Determine new tag length. Since we can use 4 characters per |
---|
422 | // tag position, we only have to use 4log( #samples) |
---|
423 | int tagLength = Math.ceil( Math.log( assaySamples.size() ) / Math.log( 4 ) ) |
---|
424 | int tagNumber = 0; |
---|
425 | |
---|
426 | assaySamples.each { assaySample -> |
---|
427 | if( assaySample.numSequences() > 0 ) { |
---|
428 | // Create a new tag for this assaysample |
---|
429 | def tag = createTag( tagLength, tagNumber++); |
---|
430 | |
---|
431 | // Save the tag for exporting |
---|
432 | tags << [assaySampleId: assaySample.id, sampleName: assaySample.sample.name, assayName: assaySample.assay.name, studyName: assaySample.assay.study.name, tag: tag] |
---|
433 | } |
---|
434 | } |
---|
435 | |
---|
436 | // Now create zip file for fasta and qual files |
---|
437 | ZipOutputStream zipFile = new ZipOutputStream( new BufferedOutputStream( outStream ) ); |
---|
438 | BufferedWriter zipWriter = new BufferedWriter( new OutputStreamWriter( zipFile ) ); |
---|
439 | |
---|
440 | // We have to loop twice through the sequenceData, since we can't write part of the sequence |
---|
441 | // file and part of the qual files mixed. We have to write the full sequence file first. |
---|
442 | try { |
---|
443 | zipFile.putNextEntry( new ZipEntry( name + ".fasta" ) ); |
---|
444 | |
---|
445 | assaySamples.each { assaySample -> |
---|
446 | if( assaySample.numSequences() > 0 ) { |
---|
447 | def currentTag = tags.find { it.assaySampleId == assaySample.id }; |
---|
448 | |
---|
449 | assaySample.sequenceData.each { sequenceData -> |
---|
450 | copyFastaFileForExport( fileService.get( sequenceData.sequenceFile, permanentDirectory ), currentTag.tag, zipWriter) |
---|
451 | } |
---|
452 | } |
---|
453 | } |
---|
454 | zipWriter.flush(); |
---|
455 | zipFile.closeEntry(); |
---|
456 | |
---|
457 | if( exportQual ) { |
---|
458 | zipFile.putNextEntry( new ZipEntry( name + ".qual" ) ); |
---|
459 | |
---|
460 | assaySamples.each { assaySample -> |
---|
461 | if( assaySample.numSequences() > 0 ) { |
---|
462 | def currentTag = tags.find { it.assaySampleId == assaySample.id }; |
---|
463 | |
---|
464 | assaySample.sequenceData.each { sequenceData -> |
---|
465 | copyQualFileForExport( fileService.get( sequenceData.qualityFile, permanentDirectory ), currentTag.tag, zipWriter) |
---|
466 | } |
---|
467 | } |
---|
468 | } |
---|
469 | |
---|
470 | zipWriter.flush(); |
---|
471 | zipFile.closeEntry(); |
---|
472 | } |
---|
473 | |
---|
474 | } catch( Exception e ) { |
---|
475 | log.error "Error while writing to fastafile or qualfile: " + e.getMessage(); |
---|
476 | } finally { |
---|
477 | // Always close zip entry |
---|
478 | try { |
---|
479 | zipFile.closeEntry(); |
---|
480 | } catch( Exception e ) { |
---|
481 | log.error "Error while closing zip entry for fasta and qual: " + e.getMessage(); |
---|
482 | } |
---|
483 | } |
---|
484 | |
---|
485 | // Export a tab delimited file with tags |
---|
486 | zipFile.putNextEntry( new ZipEntry( name + "_sample-tag.tab" ) ); |
---|
487 | exportSampleTagFile( tags, zipWriter ); |
---|
488 | zipWriter.flush(); |
---|
489 | zipFile.closeEntry(); |
---|
490 | |
---|
491 | // Export an excel file with information about the samples |
---|
492 | zipFile.putNextEntry( new ZipEntry( name + ".xls" ) ); |
---|
493 | sampleExcelService.exportExcelSampleData( assaySamples, tags, zipFile ); |
---|
494 | zipFile.closeEntry(); |
---|
495 | |
---|
496 | zipFile.close(); |
---|
497 | } |
---|
498 | |
---|
499 | /** |
---|
500 | * Creates a tab delimited file with two columns and column headers "Sequence" and "Samplename" |
---|
501 | * @param tags Map with newly created tags |
---|
502 | * @param zipWriter Writer to write the data to |
---|
503 | */ |
---|
504 | protected void exportSampleTagFile( List tags, Writer zipWriter ) { |
---|
505 | zipWriter.write( "Sequence" + "\t" + "Samplename" + "\n" ); |
---|
506 | |
---|
507 | // Check whether the sample names are unique. If they aren't, the assay and study names |
---|
508 | // are appended to the sample name |
---|
509 | def sampleNames = tags*.sampleNames; |
---|
510 | if( sampleNames.unique().size() < sampleNames.size() ) { |
---|
511 | tags.each { |
---|
512 | it.uniqueName = it.sampleName + " (" + it.assayName + " / " + it.studyName + ")"; |
---|
513 | } |
---|
514 | } else { |
---|
515 | tags.each { |
---|
516 | it.uniqueName = it.sampleName; |
---|
517 | } |
---|
518 | } |
---|
519 | |
---|
520 | tags.each { |
---|
521 | zipWriter.write( it.tag + "\t" + it.uniqueName + "\n" ); |
---|
522 | } |
---|
523 | } |
---|
524 | |
---|
525 | /** |
---|
526 | * Creates a unique tag for the given number |
---|
527 | * @param length |
---|
528 | * @param tagNumber |
---|
529 | * @return |
---|
530 | */ |
---|
531 | public String createTag( int length, int tagNumber ) { |
---|
532 | def chars = ["C", "A", "G", "T"]; |
---|
533 | def numChars = chars.size(); |
---|
534 | |
---|
535 | if( tagNumber > numChars ** length ) |
---|
536 | throw new Exception( "Given tag number (" + tagNumber + ") is too large for the specified length (" + length + ")") |
---|
537 | |
---|
538 | String tag = ""; |
---|
539 | |
---|
540 | for( def i = 0; i < length; i++ ) { |
---|
541 | int currentChar = tagNumber % numChars |
---|
542 | |
---|
543 | tag = chars[ currentChar ] + tag; |
---|
544 | |
---|
545 | tagNumber = Math.floor( tagNumber / numChars ); |
---|
546 | } |
---|
547 | |
---|
548 | return tag |
---|
549 | } |
---|
550 | |
---|
551 | /** |
---|
552 | * Copies the contents of the given sequence file to the output file and prepends the tag to every sequences |
---|
553 | * @param inFile Filename of the file to be read |
---|
554 | * @param tag |
---|
555 | * @param outWriter |
---|
556 | * @return |
---|
557 | */ |
---|
558 | protected boolean copyFastaFileForExport( File inFile, String tag, BufferedWriter outWriter ) { |
---|
559 | // Walk through the lines in the file, starting with '>' |
---|
560 | // (and where the following line contains a character other than '>') |
---|
561 | |
---|
562 | try { |
---|
563 | BufferedReader inReader = new BufferedReader( new FileReader( inFile ) ); |
---|
564 | |
---|
565 | String line = null |
---|
566 | String newLine = null |
---|
567 | String sequence = ""; |
---|
568 | |
---|
569 | def lengthPattern = ~/length=(\d+)/ |
---|
570 | def lengthMatches |
---|
571 | int length = 0; |
---|
572 | int tagLength = tag.size(); |
---|
573 | |
---|
574 | while( ( line = inReader.readLine()) != null) { |
---|
575 | if( line.size() == 0 ) { |
---|
576 | // Print the sequence we collected, before writing the empty line |
---|
577 | printSequence( outWriter, sequence, tag ); |
---|
578 | sequence = ""; |
---|
579 | |
---|
580 | // Empty line |
---|
581 | outWriter.newLine(); |
---|
582 | } else if( line[ 0 ] == '>' ) { |
---|
583 | // Print the sequence we collected, before writing the new comments tag |
---|
584 | printSequence( outWriter, sequence, tag ); |
---|
585 | sequence = ""; |
---|
586 | |
---|
587 | // Comments line: replace length=### with the |
---|
588 | // updated length, and put the line in the |
---|
589 | lengthMatches = ( line =~ lengthPattern ); |
---|
590 | if( lengthMatches ) { |
---|
591 | length = Integer.valueOf( lengthMatches[0][1] ) + tagLength; |
---|
592 | newLine = lengthMatches.replaceAll( "length=" + length ); |
---|
593 | } |
---|
594 | |
---|
595 | outWriter.write(newLine); |
---|
596 | outWriter.newLine(); |
---|
597 | } else { |
---|
598 | // This is part of the sequence. We collect the whole sequence and |
---|
599 | // determine in the end how to write it to the file |
---|
600 | sequence += line; |
---|
601 | } |
---|
602 | } |
---|
603 | |
---|
604 | // Print the sequence we collected, before ending the file |
---|
605 | printSequence( outWriter, sequence, tag ); |
---|
606 | sequence = ""; |
---|
607 | |
---|
608 | } catch( Exception e ) { |
---|
609 | log.error( "An error occurred while copying contents from " + inFile.getName() + ": " + e.getMessage() ); |
---|
610 | return false; |
---|
611 | } |
---|
612 | } |
---|
613 | |
---|
614 | /** |
---|
615 | * Prints a sequence to the output file |
---|
616 | * @param outWriter |
---|
617 | * @param sequence |
---|
618 | * @param tag |
---|
619 | */ |
---|
620 | private void printSequence( BufferedWriter outWriter, String sequence, String tag, int maxWidth = 60 ) { |
---|
621 | // If no sequence is given, also don't prepend it with the tag |
---|
622 | if( sequence.size() == 0 ) |
---|
623 | return |
---|
624 | |
---|
625 | // Prepend the tag to the sequence |
---|
626 | sequence = tag + sequence; |
---|
627 | |
---|
628 | // Write the sequence with a width of maxWidth characters per line |
---|
629 | while( sequence ) { |
---|
630 | if( sequence.size() > maxWidth ) { |
---|
631 | outWriter.write( sequence[0..maxWidth-1] ); |
---|
632 | sequence = sequence[maxWidth..-1] |
---|
633 | } else { |
---|
634 | outWriter.write( sequence ); |
---|
635 | sequence = null; |
---|
636 | } |
---|
637 | outWriter.newLine(); |
---|
638 | } |
---|
639 | } |
---|
640 | |
---|
641 | /** |
---|
642 | * Copies the contents of the given qual file to the output file and prepends the tag quality score to every sequence. |
---|
643 | * For every tag character '40' is prepended to the qual scores |
---|
644 | * |
---|
645 | * @param inFile Filename of the file to be read |
---|
646 | * @param tag |
---|
647 | * @param outWriter |
---|
648 | * @return |
---|
649 | */ |
---|
650 | protected boolean copyQualFileForExport( File inFile, String tag, BufferedWriter outWriter ) { |
---|
651 | // Walk through the lines in the file, starting with '>' |
---|
652 | // (and where the following line contains a character other than '>') |
---|
653 | try { |
---|
654 | BufferedReader inReader = new BufferedReader( new FileReader( inFile ) ); |
---|
655 | |
---|
656 | String line = null |
---|
657 | String newLine = null |
---|
658 | List<Integer> qualScores = [] |
---|
659 | |
---|
660 | def lengthPattern = ~/length=(\d+)/ |
---|
661 | def lengthMatches |
---|
662 | int length = 0; |
---|
663 | int tagLength = tag.size(); |
---|
664 | |
---|
665 | while( ( line = inReader.readLine()) != null) { |
---|
666 | if( line.size() == 0 ) { |
---|
667 | // Print the quality scores we collected, before writing the empty line |
---|
668 | printQualScore( outWriter, qualScores, tagLength ); |
---|
669 | qualScores = []; |
---|
670 | |
---|
671 | // Empty line |
---|
672 | outWriter.newLine(); |
---|
673 | } else if( line[ 0 ] == '>' ) { |
---|
674 | // Print the quality scores we collected, before writing the empty line |
---|
675 | printQualScore( outWriter, qualScores, tagLength ); |
---|
676 | qualScores = []; |
---|
677 | |
---|
678 | // Comments line: replace length=### with the |
---|
679 | // updated length, and put the line in the |
---|
680 | lengthMatches = ( line =~ lengthPattern ); |
---|
681 | if( lengthMatches ) { |
---|
682 | length = Integer.valueOf( lengthMatches[0][1] ) + tagLength; |
---|
683 | newLine = lengthMatches.replaceAll( "length=" + length ); |
---|
684 | } |
---|
685 | |
---|
686 | outWriter.write(newLine); |
---|
687 | outWriter.newLine(); |
---|
688 | } else { |
---|
689 | // This is part of the quality score. We collect the whole set of quality |
---|
690 | // scores and determine in the end how to write it to the file |
---|
691 | qualScores += line.split( " " ).collect { |
---|
692 | if( !it.isInteger() ) |
---|
693 | return 0; |
---|
694 | else |
---|
695 | return Integer.parseInt( it ); |
---|
696 | }; |
---|
697 | } |
---|
698 | } |
---|
699 | |
---|
700 | // Print the quality scores we collected, before ending the file |
---|
701 | printQualScore( outWriter, qualScores, tagLength ); |
---|
702 | qualScores = []; |
---|
703 | |
---|
704 | } catch( Exception e ) { |
---|
705 | log.error( "An error occurred while copying contents from " + inFile.getName() + ": " + e.getMessage() ); |
---|
706 | return false; |
---|
707 | } |
---|
708 | } |
---|
709 | |
---|
710 | /** |
---|
711 | * Prints a sequence to the output file |
---|
712 | * @param outWriter |
---|
713 | * @param sequence |
---|
714 | * @param tag |
---|
715 | */ |
---|
716 | private void printQualScore( BufferedWriter outWriter, List<Integer> qualScores, int tagLength, int maxWidth = 60 ) { |
---|
717 | // If no qualScores are given, also don't prepend it with the tag |
---|
718 | if( qualScores.size() == 0 ) |
---|
719 | return |
---|
720 | |
---|
721 | // Prepend the tag to the sequence |
---|
722 | qualScores = Collections.nCopies( tagLength, 40 ) + qualScores; |
---|
723 | |
---|
724 | // Write the sequence with a width of maxWidth characters per line |
---|
725 | while( qualScores ) { |
---|
726 | if( qualScores.size() > maxWidth ) { |
---|
727 | outWriter.write( qualScores[0..maxWidth-1].join( " " ) ); |
---|
728 | qualScores = qualScores[maxWidth..-1] |
---|
729 | } else { |
---|
730 | outWriter.write( qualScores.join( " " ) ); |
---|
731 | qualScores = null; |
---|
732 | } |
---|
733 | outWriter.newLine(); |
---|
734 | } |
---|
735 | } |
---|
736 | |
---|
737 | } |
---|