Changeset 227


Ignore:
Timestamp:
Sep 24, 2009, 5:13:30 PM (11 years ago)
Author:
jjbot
Message:

Added flush functionality to interpro, fixed a nasty bug with Marc. Extended Ibidas Ontology to facilitate flushing Interpro.

Files:
6 edited

Legend:

Unmodified
Added
Removed
  • container/opcon.py

    r203 r227  
    115115            self._fields = left._fields + rfields
    116116            lenleft = len(left._fields)
     117
    117118            self._actidx =  left._actidx + [i + lenleft for i in right._actidx]
    118            
    119119
    120120            self._segments = (SegmentSet(0,left_field_ids),\
  • container/qg_transform.py

    r218 r227  
    809809                #check if strictly ordered (i.e. 1,2,3,4 is strictly ordered,
    810810                # 1,3,4 is not)
    811                 elif(all([(r - l) == 1 for (l,r) in zip(res[:-1],res[1:])])):
     811                elif(len(res) > 0 and all([(r - l) == 1 for (l,r) in zip(res[:-1],res[1:])])):
    812812                    #if length equal to out fields, no trans is needed
    813813                    if(len(res) == len(out_field_idx)):
  • container/qg_translate.py

    r220 r227  
    405405                    data[outpos] = res
    406406            result.data = tuple(data[:outsize])
     407            result.ncol = outsize
    407408        return result
    408409
     
    480481            nnrow = lr.nrow
    481482
    482         return QueryResult(data=lr_data + rr_data,ncol=lr.ncol+rr.ncol,nrow=nnrow)
    483    
     483        res = QueryResult(data=lr_data + rr_data,ncol=lr.ncol+rr.ncol,nrow=nnrow)
     484        return res
    484485
    485486    def funcSetOpCon(self,node,sources):
  • data/ibidas.obo

    r215 r227  
    6565def: "Gene belongs to geneset."
    6666
     67[Typedef]
     68id: active_site
     69name: active_site
     70def: "active_site"
     71
     72[Typedef]
     73id: domain
     74name: domain
     75def: "domain"
     76
     77[Typedef]
     78id: family
     79name: family
     80def: "family"
     81
     82[Typedef]
     83id: ptm
     84name: ptm
     85def: "ptm"
     86
     87[Typedef]
     88id: child
     89name: child
     90def: "child"
     91
     92[Typedef]
     93id: found_in
     94name: found_in
     95def: "found_in"
  • parsers/interpro.py

    r225 r227  
    1515from parsers import flush_utils
    1616
    17 class InterProParser():
     17class InterproParser():
    1818    """
    1919    Parser for Interpro XML files. Uses SAX under the hood.
     
    2727
    2828    # Fieldnames for the created tables
    29     ACCESSION_FN = ['interpro_id', 'ext_db_id', 'ext_db', 'ext_db_name']
    30     ITEM_FN = ['interpro_id', 'type', 'name']
    31     ITEM_LINK_DB_FN = ['interpro_id', 'key', 'database']
    32     ITEM_LINK_TYPE_FN = ['interpro_id', 'key', 'type']
    33     ITEM_PROP_TERM_FN = ['interpro_id', 'term', 'type']
    34     SET_FN = ['name', 'version', 'date']
    35     NAME_FN = ['interpro_id', 'name']
    36     REFERENCE_FN = ['interpro_id', 'ref_id']
     29    ACCESSION_FN        = ['interpro_id', 'ext_db_id', 'ext_db', 'ext_db_name']
     30    ITEM_FN             = ['interpro_id', 'type', 'name']
     31    ITEM_LINK_DB_FN     = ['interpro_id', 'key', 'database']
     32    ITEM_LINK_TYPE_FN   = ['interpro_id', 'key', 'type']
     33    ITEM_PROP_TERM_FN   = ['interpro_id', 'term', 'type']
     34    SET_FN              = ['name', 'version', 'date']
     35    NAME_FN             = ['interpro_id', 'name']
     36    REFERENCE_FN        = ['interpro_id', 'ref_id']
    3737
    3838    def __init__(self, filename):
     
    4545        handle = open(self.filename, 'rU')
    4646        p = make_parser()
    47         interpro_handler = InterProHandler()
     47        interpro_handler = InterproHandler()
    4848        p.setContentHandler(interpro_handler)
    4949        p.parse(self.filename)
     
    5959
    6060
    61 class InterProHandler(ContentHandler):
     61class InterproHandler(ContentHandler):
    6262    def __init__(self):
    6363        self.logger = IbidasLogger.getIbidasLogger()
    6464        self.logger.setLevel(logging.ERROR)
    65         self.logger.debug('Created InterProHandler')
     65        self.logger.debug('Created InterproHandler')
    6666
    6767        self.name = ""
     
    157157            if self.PARENT_MODE: # parents of current entry
    158158                self.item_links_type.append(
    159                         ( attr['ipr_ref'], self.cur_int_id, InterProParser.CHILD)
     159                        ( attr['ipr_ref'], self.cur_int_id, InterproParser.CHILD)
    160160                )
    161161            elif self.CHILD_MODE: # children of current entry
    162162                self.item_links_type.append(
    163                         (self.cur_int_id, attr['ipr_ref'], InterProParser.CHILD)
     163                        (self.cur_int_id, attr['ipr_ref'], InterproParser.CHILD)
    164164                )
    165165            elif self.CONTAINS_MODE: # Interpro entries contained in this entry
    166166                self.item_links_type.append(
    167                         (self.cur_int_id, attr['ipr_ref'], InterProParser.CONTAINS)
     167                        (self.cur_int_id, attr['ipr_ref'], InterproParser.CONTAINS)
    168168                )
    169169            elif self.FOUNDIN_MODE: # Interpro entries having this entry
    170170                self.item_links_type.append(
    171                         (self.cur_int_id, attr['ipr_ref'], InterProParser.FOUND_IN)
     171                        (self.cur_int_id, attr['ipr_ref'], InterproParser.FOUND_IN)
    172172                )
    173173            else: # Found something we don't know what to do with
     
    245245
    246246
    247 class InterProFlusher:
    248     """Flusher class for data parsed by the InterProParser.
     247class InterproFlusher:
     248    """Flusher class for data parsed by the InterproParser.
    249249    """
    250250    PREREQUISITE_DS = [
     
    253253    ]
    254254    PREREQUISITE_TERMS = {
    255         'Active_site': 'active_site', # Ibidas ontology
    256         'Binding_site': 'SO:0000409', # Sequence ontology
    257         'child': 'child', # Ibidas Ontology
     255        'Active_site'   : 'active_site',# Ibidas ontology
     256        'Binding_site'  : 'SO:0000409', # Sequence ontology
     257        'child'         : 'child',      # Ibidas Ontology
    258258        'Conserved_site': 'SO:0000330', # Sequence ontology, non exact match
    259         'contains': 'contains', # Protein modification ontology
    260         'Domain': 'domain', # Ibidas ontology
    261         'Family': 'family', # Ibidas ontology
    262         'found_in': 'found_in', # Ibidas ontology
    263         'PTM': 'ptm', # Ibidas ontology
    264         'Region': 'SO:0000001', # Sequence ontology
    265         'Repeat': 'SO:0000657', # Sequence ontology, non exact match
     259        'contains'      : 'contains',  # Protein modification ontology
     260        'Domain'        : 'domain',    # Ibidas ontology
     261        'Family'        : 'family',    # Ibidas ontology
     262        'found_in'      : 'found_in',  # Ibidas ontology
     263        'PTM'           : 'ptm',        # Ibidas ontology
     264        'Region'        : 'SO:0000001', # Sequence ontology
     265        'Repeat'        : 'SO:0000657', # Sequence ontology, non exact match
    266266    }
    267267
    268     def __init__(self, cd):
     268    def __init__(self, cd, interpro_data):
    269269        self.cd = cd
     270        self.interpro_data = interpro_data
     271        self.interpro_db_id = None
     272        self.terms = None
     273        self.sets = None
    270274
    271275    def _checkPrerequisites(self):
     
    278282                _.name.within(*self.PREREQUISITE_DS), ('set_id', 'name')
    279283        ]
    280         assert len(set_ids) == len(self.PREREQUISITE_DS),
     284        assert len(set_ids) == len(self.PREREQUISITE_DS), \
    281285                'One or more needed datasets not found in the database.'
    282286        self.set_ids = set_ids
     
    287291        cd = self.cd
    288292        pterms = []
    289         for key, value in self.PREREQUISITE_TERMS:
     293        pnames = []
     294        for (key, value) in self.PREREQUISITE_TERMS.items():
     295            pnames.append(key)
    290296            pterms.append(value)
    291297        terms = cd.term[_.identifier.within(*pterms), ('term_id', 'identifier')]
    292         assert len(terms) == len(self.PREREQUISITE_TERMS.keys()),
     298        assert len(terms) == len(self.PREREQUISITE_TERMS.keys()), \
    293299                'One or more needed terms not found in the database.'
    294 
    295 
    296     def flush(self, interpro_data):
    297         """Converts the parsed data into something that fits into the
    298         database."""
    299         cd = self.cd
    300 
     300        names = table(pnames, ['interpro_name'])
     301        self.terms = (terms %+ names).copy()
     302        return terms
     303
     304    def _flushInterproDS(self):
     305        """Creates the Interpro entry in the set table."""
    301306        # Add Interpro dataset to database, we need that first
    302307        # version info not available
    303308        # FIXME: add description
    304         interpro_db_id = flush_utils.resolveDataset(cd, 'Interpro', '?')
     309        cd = self.cd
     310        id = flush_utils.resolveDataset(cd, 'Interpro', '?')
     311        self.interpro_db_id = id
     312        return id
     313
     314    def _flushDatasets(self):
     315        """Flushes the datasets contained in the Interpro file."""
     316        cd = self.cd
     317        interpro_data = self.interpro_data
    305318        ds_type_id = flush_utils.getIbidasDatasetTypeID(cd)
    306319        sets = interpro_data.sets
    307320        sets = sets.select(_,
    308                 scalar(interpro_db_id) / "source_id",
     321                scalar(self.interpro_db_id) / "source_id",
    309322                scalar(ds_type_id) / "type_id"
    310323        )
    311324        set_ids = cd.set.extend(sets)
    312         sets = set_ids %+ sets
     325        self.sets = set_ids %+ sets
     326
     327    def _flushItems(self):
     328        """Inserts the items and provided accessions into the database."""
     329        cd = self.cd
     330        # Flush items themselves
     331        items = self.interpro_data.items
     332        items = items.join(self.terms, 'type', 'interpro_name')
     333        db_items = items[:, ('term_id', 'interpro_id', 'name')]
     334        db_items = db_items.select(_, scalar(self.interpro_db_id) / 'source_id')
     335        db_items = db_items.fname(interpro_id='display_name', term_id='type_id')
     336        db_item_ids = cd.item.extend(db_items)
     337        db_items = db_item_ids %+ db_items
     338        self.items = db_items
     339
     340        # Flush accessions to items
     341        # First the identifiers themselves (also used as display_name)
     342        id_acs = db_items[:, ('item_id', 'display_name', 'source_id')]
     343        id_acs = db_items.select(_, scalar(-1) / 'type_id') # FIXME: should be correct ontology term!
     344        id_acs = db_items.fname(display_name='value')
     345        cd.item_prop_accession.extend(id_acs)
     346
     347        # Then the name accessions
     348        name_acs = db_items[:, ('item_id', 'name', 'source_id')]
     349        name_acs = name_acs.select(_, scalar(-1) / 'type_id') # FIXME: should be correct ontology term!
     350        name_acs = name_acs.fname(name='value')
     351        cd.item_prop_accession.extend(name_acs)
     352
     353    def _flushNames(self):
     354        """Insert the names found in the Interpro XML into the database."""
     355        cd = self.cd
     356        names = self.interpro_data.names
     357        name_acs = names.join(self.items, 'interpro_id', 'display_name')
     358        name_acs = name_acs[:, ('item_id', 'source_id')]
     359        name_acs = name_acs.select(_, scalar(-1) / 'type_id') # FIXME: should be correct ontology term!
     360        name_acs.fname(name='value')
     361        cd.item_prop_accession.extend(name_acs)
     362
     363    def _flushLinks(self):
     364        cd = self.cd
     365        items = self.items[:, ('item_id', 'display_name')]
     366        terms = self.terms[:, ('term_id', 'interpro_name')]
     367
     368        links_db = self.interpro_data.item_links_db
     369        links_db = links_db.join(self.sets, 'database', 'name')
     370        links_db = links_db[:, ('interpro_id', 'key', 'source_id')]
     371        links_db = links_db.join(items, 'key', 'display_name')
     372        links_db = links_db.fname(item_id='child_id')
     373        links_db = links_db[:, ('interpro_id', 'source_id', 'child_id')]
     374        links_db = links_db.join(items, 'interpro_id', 'display_name')
     375        links_db = links_db.fname(item_id='parent_id')
     376        links_db = links_db.select(_, scalar(-1) / 'type_id') # FIXME: should be correct ontology term!
     377        links_db = links_db[:, ('type_id', 'source_id', 'parent_id', 'child_id')]
     378
     379        links_type = self.interpro_data.item_links_type
     380        links_type = links_type.join(terms, 'type', 'interpro_name')
     381        links_type = links_type[:, ('interpro_id', 'key', 'term_id')]
     382        links_type = links_type.join(items, 'interpro_id', 'display_name')
     383        links_type = links_type.fname(item_id='parent_id')
     384        links_type = links_type[:, ('term_id', 'parent_id', 'key')]
     385        links_type = links_type.join(items, 'key', 'display_name')
     386        links_type = links_type.fname(term_id='type_id', item_id='child_id')
     387        links_type = links_type.select(_, scalar(self.interpro_db_id) / 'source_id')
     388        links_type = links_type[:, ('type_id', 'source_id', 'parent_id', 'child_id')]
     389
     390        #links = links_db ++ links_type # stack the two containers to flush simultaneously
     391        ids_db = cd.item_child_link.extend(links_db)
     392        ids_type = cd.item_child_link.extend(links_type)
     393        #return ids %+ links
     394
     395    def flush(self):
     396        """Converts the parsed data into something that fits into the
     397        database."""
     398        cd = self.cd
     399        self._checkPrerequisites()
     400        self._getRequiredTerms()
     401        self._flushInterproDS()
     402        self._flushDatasets()
     403        self._flushItems()
     404        self._flushNames()
     405        self._flushLinks()
    313406
    314407        # Flush items, we'll need them later
    315         items = interpro_data.items
     408        # items = interpro_data.items
    316409
    317410        cd.set.flush()
     411        cd.item.flush()
     412        cd.item_prop_accession.flush()
     413        cd.item_child_link.flush()
    318414        cd.commit()
  • run_imports_jan.py

    r215 r227  
    5858    psi_mi_file = ontology_dir + "psi-mi.obo"
    5959    intact_file = ontology_dir + "intact.obo"
     60    pro_file = ontology_dir + "pro.obo"
     61    psi_mod_file = ontology_dir + "PSI-MOD.obo"
     62    sbo_file = ontology_dir + "SBO_OBO.obo"
    6063    brenda_file = ontology_dir + "BrendaTissue.obo"
    6164    human_disease_file = ontology_dir + "human_disease.obo"
     
    6871        ("PSI-MI", psi_mi_file),
    6972        ("Intact", intact_file),
     73        ("Protein Ontology", pro_file),
     74        ("Protein modification Ontology", psi_mod_file),
     75        #("Systems Biology Ontology", sbo_file),
    7076        ("Brenda tissue Ontology", brenda_file),
    7177        ("Human disease Ontology", human_disease_file),
     
    99105                config['database.user'], config['database.password']).getConSet()
    100106   
    101     #import_ontologies()
     107    import_ontologies()
    102108    #import_genenames()
    103109    #import_i2d()
    104     import_msigdb()
     110    #import_msigdb()
    105111    #import_uniprot_synonyms()
Note: See TracChangeset for help on using the changeset viewer.