Changeset 11

Show
Ignore:
Timestamp:
13-10-11 13:50:52 (3 years ago)
Author:
r.w.w.brouwer@…
Message:

Bug solved in trim and added Fastq feature

Location:
trunk/narwhal
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • trunk/narwhal/bin/prepare_run.py

    r1 r11  
    109109    return retval  
    110110 
     111def _read_fastq_sample_sheet( fname ): 
     112    ''' 
     113    Read a sample sheet and return the data 
     114 
     115    Args: 
     116 
     117 
     118    Returns: 
     119 
     120 
     121    ''' 
     122    retval = [] 
     123    f      = open( fname, 'rU' ) 
     124 
     125    for l in f: 
     126        l = l.rstrip() 
     127        # parse the header and fill the column mapping 
     128        if l == '': 
     129           pass 
     130        elif l[0] == '#': 
     131            pass 
     132        # parse the file contents 
     133        else: 
     134            s = l.split( "\t" )  # split the sample-sheet line on whitespace characters 
     135            if len(s) >= 5: 
     136                d = { 
     137                    'id'                    : s[0], 
     138                    'files'                 : s[1].split(','), 
     139                    'alignment.reference'   : s[2], 
     140                    'alignment.paired_end'  : s[3], 
     141                    'alignment.application' : s[4], 
     142                    'alignment.options'     : [] #s[9].split(',') 
     143                } 
     144                if len(s) == 6: 
     145                    d['alignment.options'] = s[5].split(',') 
     146                retval.append( (s[0],d) ) 
     147            else: 
     148                 logging.fatal( "A line in the samplesheet could not be parsed:\n%s " % l ) 
     149    # 
     150    return retval 
     151 
     152 
     153 
    111154# 
    112155# 
     
    496539 
    497540    Returns: 
    498       the lane and read information 
    499     ''' 
     541      the lane and read information. If these can not be determined  
     542      return NA for both 
     543    ''' 
     544    lane = 'NA' 
     545    read = 'NA' 
    500546    obj = re.match( '.*s_(\d)_(\d)_.*', fname ) 
    501     if not obj: return None 
    502  
    503     lane = obj.group(1) 
    504     read = obj.group(2)  
    505     return( lane, read ) 
     547    if obj: 
     548        lane = obj.group(1) 
     549        read = obj.group(2)  
     550 
     551    # return the lane and read information 
     552    return ( lane, read ) 
    506553 
    507554 
     
    747794%s 
    748795 
    749 Usage: %s -s <samplesheet> <Basecalls directory> <Working directory> 
    750  
    751 """ % (message, sys.argv[0]) 
     796 
     797Usage: 
     798 
     799There are two use cases for this software. In the first use-case reads are obtained directly from the BaseCalls directory of the run folder. When reads are directly obtained from the BaseCalls folder,Narwhal is able to make its own unique run-folder. In this case a 
     800working directory is required. 
     801 
     8021A) python %s -s <sample-sheet> <Basecalls directory> <Working directory> 
     803 
     804The output folder can also be explicitely set by using the -o option  
     805 
     8061B) python %s -s <sample-sheet> -o <output directory> <Basecalls directory>  
     807 
     808Narwhal can also process previously constructed FastQ files. In this case, a differently formatted sample-sheet is required. To run 
     809Narwhal in this mode, the "-f flag" should be provided. 
     810 
     8112) python %s -fs <sample-sheet> -o <output directory> 
     812 
     813Options: 
     814-s/--samplesheet        the samplesheet 
     815-o/--outdir             the output directory 
     816-f/--fastq-input        use Narwhal in FastQ mode 
     817-D/--DRYRUN             perform a DRYRUN only 
     818-h/--help               display the help message 
     819 
     820<Basecalls directory>   the directory containing the Illumina QseQ files 
     821<Working directory>     the directory in which Narwhal should create a new run folder 
     822 
     823 
     824""" % (message, sys.argv[0],sys.argv[0],sys.argv[0]) 
    752825    if error: sys.exit( error ) 
    753826 
     
    755828 
    756829if __name__ == '__main__': 
     830 
     831    FROM_FASTQ  = False  
    757832 
    758833    args = [] 
    759834    d_basecalls = None  
    760835    d_workdir   = None 
     836    d_outdir    = None 
    761837    f_samplesh  = None 
    762838 
     
    770846    # 
    771847    try: 
    772         opts, args = getopt.getopt( sys.argv[1:], "hs:D", ["help", "samplesheet", "DRYRUN"] ) 
     848        opts, args = getopt.getopt( sys.argv[1:], "o:fhs:D", ["outdir", "fastq-input", "help", "samplesheet", "DRYRUN"] ) 
    773849 
    774850        for o,a in opts: 
     
    779855                # determine whether we are doing a dry run 
    780856                DRYRUN = True 
    781  
     857            if o in ('-f','--fastq-input'): 
     858                FROM_FASTQ = True 
     859            if o in ('-o','--outdir'): 
     860                d_outdir = os.path.abspath(a) 
    782861 
    783862    except getopt.GetoptError, err: 
     
    794873        if not os.path.exists( d_basecalls ): 
    795874            usage( 'basecall directory (%s) does not exist' % d_basecalls, error=2 ) 
     875    elif FROM_FASTQ : 
     876        if not os.path.exists( d_outdir ): 
     877            usage( 'Output directory (%s) does not exist' % d_outdir, error=2 ) 
     878    elif d_outdir != None and len(args) == 1: 
     879        d_basecalls = os.path.abspath( args[0] ) 
     880        if not os.path.exists( d_basecalls ): 
     881            usage( 'basecall directory (%s) does not exist' % d_basecalls, error=2 ) 
    796882    else: 
    797883        usage( 'The BaseCalls and WorkDir arguments are not defined', error=2 ) 
     
    801887    # start the workflow 
    802888    #  
    803  
    804     # get the new working directory 
    805     dn_flowc = _get_flowcell_dirname( d_basecalls ) 
    806     d_outdir = _outdir( d_workdir, dn_flowc ) 
    807     logging.info( "Output will be written to %s" % d_outdir ) 
    808  
    809  
    810     # read the sample sheet 
    811     a_samples = _read_sample_sheet( f_samplesh )      
    812     logging.info( "%d Samples parsed from samplesheet %s" %(len(a_samples), f_samplesh) )     
    813  
    814     # create the directory structure 
    815     odirs = create_dir_structure( d_outdir, sampleids=[ s[0] for s in a_samples] )  
    816     logging.info( "Created %d output directories" % len(odirs) ) 
    817  
    818     # make the list file that will govern the qseq to fastq conversion  
    819     a_qseq  = _get_qseq_paths( d_basecalls ) 
    820     a_fastq = _qseq_fastq_lst( odirs[0], a_qseq ) 
    821     logging.info( "Prepared %d QseQ to FastQ conversions" % len(a_fastq) ) 
     889    if not FROM_FASTQ: 
     890        # get the new working directory 
     891        if d_outdir == None: 
     892            dn_flowc = _get_flowcell_dirname( d_basecalls ) 
     893            d_outdir = _outdir( d_workdir, dn_flowc ) 
     894 
     895        logging.info( "Output will be written to %s" % d_outdir ) 
     896 
     897 
     898        # read the sample sheet 
     899        a_samples = _read_sample_sheet( f_samplesh )      
     900        logging.info( "%d Samples parsed from samplesheet %s" %(len(a_samples), f_samplesh) )     
     901 
     902        # create the directory structure 
     903        odirs = create_dir_structure( d_outdir, sampleids=[ s[0] for s in a_samples] )  
     904        logging.info( "Created %d output directories" % len(odirs) ) 
     905 
     906        # make the list file that will govern the qseq to fastq conversion  
     907        a_qseq  = _get_qseq_paths( d_basecalls ) 
     908        a_fastq = _qseq_fastq_lst( odirs[0], a_qseq ) 
     909        logging.info( "Prepared %d QseQ to FastQ conversions" % len(a_fastq) ) 
    822910     
    823     # make the list file that will govern the sample demultiplexing 
    824     a_sfastq = _process_singleplexes( odirs[1], a_fastq, a_samples ) 
    825     a_dindex, a_dfastq = _process_multiplexes( odirs[1], a_fastq, a_samples ) 
    826     logging.info( "Prepared singleplex mergere to %d FastQ files" % len(a_sfastq) ) 
    827     logging.info( "Prepared demultiplexing and mergere to %d FastQ files from %d index files" % (len(a_dfastq), len(a_dindex)) )     
    828  
    829     #  
    830     a_fastq = [] 
    831     [a_fastq.append( t )  for t in a_sfastq ] 
    832     [a_fastq.append( t )  for t in a_dfastq ] 
     911        # make the list file that will govern the sample demultiplexing 
     912        a_sfastq = _process_singleplexes( odirs[1], a_fastq, a_samples ) 
     913        a_dindex, a_dfastq = _process_multiplexes( odirs[1], a_fastq, a_samples ) 
     914        logging.info( "Prepared singleplex mergere to %d FastQ files" % len(a_sfastq) ) 
     915        logging.info( "Prepared demultiplexing and mergere to %d FastQ files from %d index files" % (len(a_dfastq), len(a_dindex)) )     
     916      
     917        #  
     918        a_fastq = [] 
     919        [a_fastq.append( t )  for t in a_sfastq ] 
     920        [a_fastq.append( t )  for t in a_dfastq ] 
     921    else : 
     922        #  
     923        logging.info( "Running NARWHAL in FastQ mode" ) 
     924        logging.info( "Output will be written to %s" % d_outdir ) 
     925        a_samples = _read_fastq_samplesheet( f_samplesh )  
     926        odirs     = create_dir_structure( d_outdir, sampleids=[ s[0] for s in a_samples] )  
     927 
     928        # prepare the FastQ files 
     929        a_fastq = [] 
     930        for s in a_samples: 
     931            [ a_fastq.append( (s[0], f) ) for f in s[1] ] 
    833932    
    834933    # make the options files for the alignments 
     
    842941    logging.info( "Finished %s" % sys.argv[0] ) 
    843942 
    844     #[sys.stdout.write( "%s\n" % str(x)) for x in a_bamf] 
    845     #[sys.stdout.write( "%s\n" % str(x)) for x in a_pdf] 
    846  
    847943 
    848944    #  
  • trunk/narwhal/lib/python/ngs/util/trim.py

    r2 r11  
    66 
    77''' 
     8 
     9import logging 
    810 
    911# 
     
    3335    # set the full sequence as the return value 
    3436    ret = pre + seq 
    35      
     37    logging.debug( "Processing sequence %s with pre-sequence %s " % (seq, pre) )      
     38 
    3639    # determine whether the seed is present in the sequence 
    3740    i = seq.find( seed ) 
     41    logging.debug( "Seed position is character %d" % i ) 
    3842    if i != -1 or seq != '': 
    3943 
     
    4145        # whether the remaining string is present  
    4246        mseq = seq[i:] 
    43         if len( query ) > len(mseq) : 
     47        logging.debug( "Sequence to match is %s" % mseq ) 
     48 
     49        if len( query ) >= len(mseq) : 
    4450            squery = query[ 0:len(mseq) ] 
     51            logging.debug( "The query is longer than the sequence to match: matching to %s" % squery ) 
    4552        else : 
    4653            mseq   = seq[ i:len(query) ] 
    4754            squery = query 
    48                  
    49         # test whether the match is found 
    50         if mseq == squery : 
    51             # we found a match  
    52             ret = pre + seq[0:i] 
    53         else : 
    54             # recursively search for more matches 
    55             # but first go over the seed and proceed 
    56             #  
    57             co  = i + len(seed) 
    58             if co > len(seq): co = len(seq) 
    59             tch = seq[ co: ] 
    60             ret = seeded_trim( tch, query, seed, pre=pre+seq[0:co] ) 
    61                      
     55            logging.debug( "The match sequence is shorter than the query; shortening to %s" % mseq )  
     56 
     57        logging.debug( "Matching %s to %s " % (squery, mseq)  )  
     58 
     59        if len(mseq) > len(seed): 
     60             
     61            # test whether the match is found 
     62            if mseq == squery : 
     63                # we found a match  
     64                ret = pre + seq[0:i] 
     65                logging.debug( "We found a match: returning %s" % ret ) 
     66            else : 
     67                # recursively search for more matches 
     68                # but first go over the seed and proceed 
     69                #  
     70                co  = i + len(seed)  
     71                if co > len(seq): co = len(seq) 
     72                tch = seq[ co: ] 
     73                logging.debug( "No match found repeating cycle with sequence from %d ( %s )" % (co, tch) ) 
     74                ret = seeded_trim( tch, query, seed, pre=pre+seq[0:(co)] ) 
     75        else: 
     76            logging.debug( "The sequence to match is shorter than the seed-length" ) 
    6277    # return the sub sequence 
    6378    return ret  
     
    146161if __name__ == '__main__': 
    147162    print "Tests not yet implemented" 
    148      
    149      
     163    logging.basicConfig( level=logging.DEBUG ) 
     164     
     165    seqa  = "TGGGGGGGAGTATGCGGCCCGCACTGAGCTGCGCAT" 
     166    seqb  = "TGGGGGGGAGTATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC" 
     167    seqc  = "TGGGGGGGAGTATTGGAATTCTCGGGTGCCAAGGAA" 
     168    seqd  = "TGGGGGTGGATATTGGAATTCTCGGGTGCCAAGGAA" 
     169    seqe  = "TGGGGGTGGAAGGAGTATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC"  
     170 
     171    query = "TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC" 
     172    seed  = query[0:5] 
     173     
     174 
     175    logging.debug( "Query is %s" % query ) 
     176    logging.debug( "Seed is %s" % seed ) 
     177    logging.debug( "" ) 
     178    logging.debug( "-NON Matching case-----------------") 
     179    logging.debug( "Sequence is %s" % seqa ) 
     180    logging.debug( "Result is %s" % seeded_trim( seqa, query, seed ) ) 
     181    logging.debug( "" ) 
     182    logging.debug( "-Full Matching case-----------------" ) 
     183    logging.debug( "Sequence is %s" % seqb ) 
     184    logging.debug( "Result is %s" % seeded_trim( seqb, query, seed ) ) 
     185    logging.debug( "" ) 
     186    logging.debug( "-Partial Matching case-----------------" ) 
     187    logging.debug( "Sequence is %s" % seqc ) 
     188    logging.debug( "Result is %s" % seeded_trim( seqc, query, seed ) ) 
     189    logging.debug( "" ) 
     190    logging.debug( "-Multiple seed partial match case-----------------" ) 
     191    logging.debug( "Sequence is %s" % seqd ) 
     192    logging.debug( "Result is %s" % seeded_trim( seqd, query, seed ) ) 
     193    logging.debug( "" ) 
     194    logging.debug( "-Multiple seed full match case-----------------" ) 
     195    logging.debug( "Sequence is %s" % seqe ) 
     196    logging.debug( "Result is %s" % seeded_trim( seqe, query, seed ) ) 
     197 
     198 
     199 
     200     
     201