Changeset 228


Ignore:
Timestamp:
Sep 25, 2009, 9:21:22 AM (11 years ago)
Author:
jjbot
Message:

Added flusher for the interpro go terms.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • parsers/interpro.py

    r227 r228  
    99from xml.sax import make_parser
    1010from xml.sax.handler import ContentHandler
     11
     12# Numpy imports
     13from numpy import unique
    1114
    1215# Ibidas imports
     
    3235    ITEM_LINK_TYPE_FN   = ['interpro_id', 'key', 'type']
    3336    ITEM_PROP_TERM_FN   = ['interpro_id', 'term', 'type']
    34     SET_FN              = ['name', 'version', 'date']
    3537    NAME_FN             = ['interpro_id', 'name']
    3638    REFERENCE_FN        = ['interpro_id', 'ref_id']
     39    SET_FN              = ['name', 'version', 'date']
    3740
    3841    def __init__(self, filename):
     
    249252    """
    250253    PREREQUISITE_DS = [
    251             'Ibidas Ontology', 'Sequence Ontology',
     254            'Ibidas Ontology',   # FIXME: should be changed to Interpro specific ontology
     255            'Sequence Ontology', # Provides some ontology terms
     256            'Gene Ontology',     # Needed to link against (XML contains links to GO)
    252257            'Protein modification Ontology'
    253258    ]
     
    362367
    363368    def _flushLinks(self):
     369        """Flushes the links found in the Interpro file. There are two types: relations with
     370        a dataset association and relations with an associated type. These two are joined
     371        together to be flushed as one list."""
    364372        cd = self.cd
    365373        items = self.items[:, ('item_id', 'display_name')]
     
    393401        #return ids %+ links
    394402
     403    def _flushItemTerms(self):
     404        """Flushes item_prop_terms to database."""
     405
     406        # Create some shortcuts to work with
     407        cd = self.cd
     408        ipt = self.interpro_data.item_prop_terms
     409        items = self.items[:, ('item_id', 'display_name')]
     410        terms = self.terms[:, ('term_id', 'interpro_name')]
     411
     412        # Find the GO term identifiers in the db.
     413        go_terms = unique(ipt[:, ('term',)]())
     414        go_terms_db = cd.term[
     415                _.identifier.within(*go_terms.tolist()),
     416                ('term_id', 'identifier')
     417        ].copy()
     418        assert len(go_terms) == len(go_terms_db), \
     419                "Not all required GO terms could be found in the database."
     420
     421        # Get data ready for the database
     422        ipt = ipt.join(items, 'interpro_id', 'display_name') # provides item_id
     423        ipt = ipt[:, ('item_id', 'term', 'type')]
     424        ipt = ipt.join(go_terms_db, 'term', 'identifier') # provides term_id
     425        ipt = ipt[:, ('item_id', 'term_id', 'type')]
     426        ipt = ipt.select(_,
     427                scalar(self.interpro_db_id) / 'source_id', # change to GO id?
     428                scalar(-1) / 'type_id' #FIXME: change to real type
     429        )
     430        cd.item_prop_term.extend(ipt)
     431       
     432
    395433    def flush(self):
    396434        """Converts the parsed data into something that fits into the
     
    404442        self._flushNames()
    405443        self._flushLinks()
    406 
    407         # Flush items, we'll need them later
    408         # items = interpro_data.items
     444        self._flushItemTerms()
    409445
    410446        cd.set.flush()
     
    412448        cd.item_prop_accession.flush()
    413449        cd.item_child_link.flush()
     450        cd.item_prop_term.flush()
    414451        cd.commit()
Note: See TracChangeset for help on using the changeset viewer.