Changeset 11


Ignore:
Timestamp:
Oct 13, 2011, 1:50:52 PM (5 years ago)
Author:
r.w.w.brouwer@…
Message:

Bug solved in trim and added Fastq feature

Location:
trunk/narwhal
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/narwhal/bin/prepare_run.py

    r1 r11  
    109109    return retval
    110110
     111def _read_fastq_sample_sheet( fname ):
     112    '''
     113    Read a sample sheet and return the data
     114
     115    Args:
     116
     117
     118    Returns:
     119
     120
     121    '''
     122    retval = []
     123    f      = open( fname, 'rU' )
     124
     125    for l in f:
     126        l = l.rstrip()
     127        # parse the header and fill the column mapping
     128        if l == '':
     129           pass
     130        elif l[0] == '#':
     131            pass
     132        # parse the file contents
     133        else:
     134            s = l.split( "\t" )  # split the sample-sheet line on whitespace characters
     135            if len(s) >= 5:
     136                d = {
     137                    'id'                    : s[0],
     138                    'files'                 : s[1].split(','),
     139                    'alignment.reference'   : s[2],
     140                    'alignment.paired_end'  : s[3],
     141                    'alignment.application' : s[4],
     142                    'alignment.options'     : [] #s[9].split(',')
     143                }
     144                if len(s) == 6:
     145                    d['alignment.options'] = s[5].split(',')
     146                retval.append( (s[0],d) )
     147            else:
     148                 logging.fatal( "A line in the samplesheet could not be parsed:\n%s " % l )
     149    #
     150    return retval
     151
     152
     153
    111154#
    112155#
     
    496539
    497540    Returns:
    498       the lane and read information
    499     '''
     541      the lane and read information. If these can not be determined
     542      return NA for both
     543    '''
     544    lane = 'NA'
     545    read = 'NA'
    500546    obj = re.match( '.*s_(\d)_(\d)_.*', fname )
    501     if not obj: return None
    502 
    503     lane = obj.group(1)
    504     read = obj.group(2)
    505     return( lane, read )
     547    if obj:
     548        lane = obj.group(1)
     549        read = obj.group(2)
     550
     551    # return the lane and read information
     552    return ( lane, read )
    506553
    507554
     
    747794%s
    748795
    749 Usage: %s -s <samplesheet> <Basecalls directory> <Working directory>
    750 
    751 """ % (message, sys.argv[0])
     796
     797Usage:
     798
     799There are two use cases for this software. In the first use-case reads are obtained directly from the BaseCalls directory of the run folder. When reads are directly obtained from the BaseCalls folder,Narwhal is able to make its own unique run-folder. In this case a
     800working directory is required.
     801
     8021A) python %s -s <sample-sheet> <Basecalls directory> <Working directory>
     803
     804The output folder can also be explicitely set by using the -o option
     805
     8061B) python %s -s <sample-sheet> -o <output directory> <Basecalls directory>
     807
     808Narwhal can also process previously constructed FastQ files. In this case, a differently formatted sample-sheet is required. To run
     809Narwhal in this mode, the "-f flag" should be provided.
     810
     8112) python %s -fs <sample-sheet> -o <output directory>
     812
     813Options:
     814-s/--samplesheet        the samplesheet
     815-o/--outdir             the output directory
     816-f/--fastq-input        use Narwhal in FastQ mode
     817-D/--DRYRUN             perform a DRYRUN only
     818-h/--help               display the help message
     819
     820<Basecalls directory>   the directory containing the Illumina QseQ files
     821<Working directory>     the directory in which Narwhal should create a new run folder
     822
     823
     824""" % (message, sys.argv[0],sys.argv[0],sys.argv[0])
    752825    if error: sys.exit( error )
    753826
     
    755828
    756829if __name__ == '__main__':
     830
     831    FROM_FASTQ  = False
    757832
    758833    args = []
    759834    d_basecalls = None
    760835    d_workdir   = None
     836    d_outdir    = None
    761837    f_samplesh  = None
    762838
     
    770846    #
    771847    try:
    772         opts, args = getopt.getopt( sys.argv[1:], "hs:D", ["help", "samplesheet", "DRYRUN"] )
     848        opts, args = getopt.getopt( sys.argv[1:], "o:fhs:D", ["outdir", "fastq-input", "help", "samplesheet", "DRYRUN"] )
    773849
    774850        for o,a in opts:
     
    779855                # determine whether we are doing a dry run
    780856                DRYRUN = True
    781 
     857            if o in ('-f','--fastq-input'):
     858                FROM_FASTQ = True
     859            if o in ('-o','--outdir'):
     860                d_outdir = os.path.abspath(a)
    782861
    783862    except getopt.GetoptError, err:
     
    794873        if not os.path.exists( d_basecalls ):
    795874            usage( 'basecall directory (%s) does not exist' % d_basecalls, error=2 )
     875    elif FROM_FASTQ :
     876        if not os.path.exists( d_outdir ):
     877            usage( 'Output directory (%s) does not exist' % d_outdir, error=2 )
     878    elif d_outdir != None and len(args) == 1:
     879        d_basecalls = os.path.abspath( args[0] )
     880        if not os.path.exists( d_basecalls ):
     881            usage( 'basecall directory (%s) does not exist' % d_basecalls, error=2 )
    796882    else:
    797883        usage( 'The BaseCalls and WorkDir arguments are not defined', error=2 )
     
    801887    # start the workflow
    802888    #
    803 
    804     # get the new working directory
    805     dn_flowc = _get_flowcell_dirname( d_basecalls )
    806     d_outdir = _outdir( d_workdir, dn_flowc )
    807     logging.info( "Output will be written to %s" % d_outdir )
    808 
    809 
    810     # read the sample sheet
    811     a_samples = _read_sample_sheet( f_samplesh )     
    812     logging.info( "%d Samples parsed from samplesheet %s" %(len(a_samples), f_samplesh) )   
    813 
    814     # create the directory structure
    815     odirs = create_dir_structure( d_outdir, sampleids=[ s[0] for s in a_samples] )
    816     logging.info( "Created %d output directories" % len(odirs) )
    817 
    818     # make the list file that will govern the qseq to fastq conversion
    819     a_qseq  = _get_qseq_paths( d_basecalls )
    820     a_fastq = _qseq_fastq_lst( odirs[0], a_qseq )
    821     logging.info( "Prepared %d QseQ to FastQ conversions" % len(a_fastq) )
     889    if not FROM_FASTQ:
     890        # get the new working directory
     891        if d_outdir == None:
     892            dn_flowc = _get_flowcell_dirname( d_basecalls )
     893            d_outdir = _outdir( d_workdir, dn_flowc )
     894
     895        logging.info( "Output will be written to %s" % d_outdir )
     896
     897
     898        # read the sample sheet
     899        a_samples = _read_sample_sheet( f_samplesh )     
     900        logging.info( "%d Samples parsed from samplesheet %s" %(len(a_samples), f_samplesh) )   
     901
     902        # create the directory structure
     903        odirs = create_dir_structure( d_outdir, sampleids=[ s[0] for s in a_samples] )
     904        logging.info( "Created %d output directories" % len(odirs) )
     905
     906        # make the list file that will govern the qseq to fastq conversion
     907        a_qseq  = _get_qseq_paths( d_basecalls )
     908        a_fastq = _qseq_fastq_lst( odirs[0], a_qseq )
     909        logging.info( "Prepared %d QseQ to FastQ conversions" % len(a_fastq) )
    822910   
    823     # make the list file that will govern the sample demultiplexing
    824     a_sfastq = _process_singleplexes( odirs[1], a_fastq, a_samples )
    825     a_dindex, a_dfastq = _process_multiplexes( odirs[1], a_fastq, a_samples )
    826     logging.info( "Prepared singleplex mergere to %d FastQ files" % len(a_sfastq) )
    827     logging.info( "Prepared demultiplexing and mergere to %d FastQ files from %d index files" % (len(a_dfastq), len(a_dindex)) )   
    828 
    829     #
    830     a_fastq = []
    831     [a_fastq.append( t )  for t in a_sfastq ]
    832     [a_fastq.append( t )  for t in a_dfastq ]
     911        # make the list file that will govern the sample demultiplexing
     912        a_sfastq = _process_singleplexes( odirs[1], a_fastq, a_samples )
     913        a_dindex, a_dfastq = _process_multiplexes( odirs[1], a_fastq, a_samples )
     914        logging.info( "Prepared singleplex mergere to %d FastQ files" % len(a_sfastq) )
     915        logging.info( "Prepared demultiplexing and mergere to %d FastQ files from %d index files" % (len(a_dfastq), len(a_dindex)) )   
     916     
     917        #
     918        a_fastq = []
     919        [a_fastq.append( t )  for t in a_sfastq ]
     920        [a_fastq.append( t )  for t in a_dfastq ]
     921    else :
     922        #
     923        logging.info( "Running NARWHAL in FastQ mode" )
     924        logging.info( "Output will be written to %s" % d_outdir )
     925        a_samples = _read_fastq_samplesheet( f_samplesh )
     926        odirs     = create_dir_structure( d_outdir, sampleids=[ s[0] for s in a_samples] )
     927
     928        # prepare the FastQ files
     929        a_fastq = []
     930        for s in a_samples:
     931            [ a_fastq.append( (s[0], f) ) for f in s[1] ]
    833932   
    834933    # make the options files for the alignments
     
    842941    logging.info( "Finished %s" % sys.argv[0] )
    843942
    844     #[sys.stdout.write( "%s\n" % str(x)) for x in a_bamf]
    845     #[sys.stdout.write( "%s\n" % str(x)) for x in a_pdf]
    846 
    847943
    848944    #
  • trunk/narwhal/lib/python/ngs/util/trim.py

    r2 r11  
    66
    77'''
     8
     9import logging
    810
    911#
     
    3335    # set the full sequence as the return value
    3436    ret = pre + seq
    35    
     37    logging.debug( "Processing sequence %s with pre-sequence %s " % (seq, pre) )     
     38
    3639    # determine whether the seed is present in the sequence
    3740    i = seq.find( seed )
     41    logging.debug( "Seed position is character %d" % i )
    3842    if i != -1 or seq != '':
    3943
     
    4145        # whether the remaining string is present
    4246        mseq = seq[i:]
    43         if len( query ) > len(mseq) :
     47        logging.debug( "Sequence to match is %s" % mseq )
     48
     49        if len( query ) >= len(mseq) :
    4450            squery = query[ 0:len(mseq) ]
     51            logging.debug( "The query is longer than the sequence to match: matching to %s" % squery )
    4552        else :
    4653            mseq   = seq[ i:len(query) ]
    4754            squery = query
    48                
    49         # test whether the match is found
    50         if mseq == squery :
    51             # we found a match
    52             ret = pre + seq[0:i]
    53         else :
    54             # recursively search for more matches
    55             # but first go over the seed and proceed
    56             #
    57             co  = i + len(seed)
    58             if co > len(seq): co = len(seq)
    59             tch = seq[ co: ]
    60             ret = seeded_trim( tch, query, seed, pre=pre+seq[0:co] )
    61                    
     55            logging.debug( "The match sequence is shorter than the query; shortening to %s" % mseq )
     56
     57        logging.debug( "Matching %s to %s " % (squery, mseq)  )
     58
     59        if len(mseq) > len(seed):
     60           
     61            # test whether the match is found
     62            if mseq == squery :
     63                # we found a match
     64                ret = pre + seq[0:i]
     65                logging.debug( "We found a match: returning %s" % ret )
     66            else :
     67                # recursively search for more matches
     68                # but first go over the seed and proceed
     69                #
     70                co  = i + len(seed)
     71                if co > len(seq): co = len(seq)
     72                tch = seq[ co: ]
     73                logging.debug( "No match found repeating cycle with sequence from %d ( %s )" % (co, tch) )
     74                ret = seeded_trim( tch, query, seed, pre=pre+seq[0:(co)] )
     75        else:
     76            logging.debug( "The sequence to match is shorter than the seed-length" )
    6277    # return the sub sequence
    6378    return ret
     
    146161if __name__ == '__main__':
    147162    print "Tests not yet implemented"
    148    
    149    
     163    logging.basicConfig( level=logging.DEBUG )
     164   
     165    seqa  = "TGGGGGGGAGTATGCGGCCCGCACTGAGCTGCGCAT"
     166    seqb  = "TGGGGGGGAGTATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC"
     167    seqc  = "TGGGGGGGAGTATTGGAATTCTCGGGTGCCAAGGAA"
     168    seqd  = "TGGGGGTGGATATTGGAATTCTCGGGTGCCAAGGAA"
     169    seqe  = "TGGGGGTGGAAGGAGTATTGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC"
     170
     171    query = "TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC"
     172    seed  = query[0:5]
     173   
     174
     175    logging.debug( "Query is %s" % query )
     176    logging.debug( "Seed is %s" % seed )
     177    logging.debug( "" )
     178    logging.debug( "-NON Matching case-----------------")
     179    logging.debug( "Sequence is %s" % seqa )
     180    logging.debug( "Result is %s" % seeded_trim( seqa, query, seed ) )
     181    logging.debug( "" )
     182    logging.debug( "-Full Matching case-----------------" )
     183    logging.debug( "Sequence is %s" % seqb )
     184    logging.debug( "Result is %s" % seeded_trim( seqb, query, seed ) )
     185    logging.debug( "" )
     186    logging.debug( "-Partial Matching case-----------------" )
     187    logging.debug( "Sequence is %s" % seqc )
     188    logging.debug( "Result is %s" % seeded_trim( seqc, query, seed ) )
     189    logging.debug( "" )
     190    logging.debug( "-Multiple seed partial match case-----------------" )
     191    logging.debug( "Sequence is %s" % seqd )
     192    logging.debug( "Result is %s" % seeded_trim( seqd, query, seed ) )
     193    logging.debug( "" )
     194    logging.debug( "-Multiple seed full match case-----------------" )
     195    logging.debug( "Sequence is %s" % seqe )
     196    logging.debug( "Result is %s" % seeded_trim( seqe, query, seed ) )
     197
     198
     199
     200   
     201   
Note: See TracChangeset for help on using the changeset viewer.