source: trunk/medline-importer/get-data @ 2

Last change on this file since 2 was 2, checked in by rob.hooft@…, 7 years ago

added source code and subproject directory structure

  • Property svn:executable set to *
File size: 1.2 KB
Line 
#!/usr/bin/python
"""Download the MEDLINE files that follow the last imported one.

The name of the most recently imported XML file is read from the
``medline_citation`` table.  The numbered files that come after it are then
fetched with curl into the ``data`` directory until the server reports that
the next file does not exist.
"""
import os
import re
import subprocess
import sys

# Numeric uid the script insists on; the error message calls it "med-import".
REQUIRED_UID = 998

DB_USER = "med-import"
DB_NAME = "medline2013"
# NOTE(review): this credential is committed to the repository.  It can be
# overridden through the environment; the literal is kept only so existing
# deployments keep working, and it should be rotated and removed from source.
DB_PASSWORD = os.environ.get("MEDLINE_DB_PASSWORD", "h7X@Wz8WCkrrqX&ZKtkV")

LAST_FILE_QUERY = (
    "select xml_file_name from medline_citation order by pmid desc limit 1;"
)

# <prefix><four digits>.xml
FILE_NAME_RE = re.compile(r'^(.*)(\d\d\d\d)\.xml$')

BASE_URL = "ftp://ftp.nlm.nih.gov/nlmdata/.medlease/gz/"

# curl exit statuses meaning the requested file is not on the server:
# 19 = FTP RETR failed, 78 = remote resource does not exist.
CURL_FILE_MISSING = (19, 78)


def warn(message):
    """Write *message* and a newline to standard error."""
    sys.stderr.write(message + "\n")


def split_file_name(file_name):
    """Split *file_name* into its prefix and four-digit sequence number.

    Returns a ``(prefix, number)`` tuple, or ``None`` when the name does not
    end in four digits followed by ``.xml``.
    """
    match = FILE_NAME_RE.match(file_name)
    if match is None:
        return None
    return match.group(1), int(match.group(2))


def download_url(prefix, number):
    """Return the URL of the gzipped file with the given prefix and number."""
    return "%s%s%04d.xml.gz" % (BASE_URL, prefix, number)


def read_last_imported_file():
    """Return the ``xml_file_name`` of the row with the highest pmid.

    Returns an empty string when mysql prints no data row.  Exits with
    status 1 when the mysql client cannot be started or reports failure,
    so a database problem is not mistaken for a file-name mismatch.
    """
    command = [
        "mysql",
        "--user=" + DB_USER,
        "--password=" + DB_PASSWORD,
        DB_NAME,
    ]
    try:
        client = subprocess.Popen(
            command,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            universal_newlines=True,
        )
    except OSError as err:
        warn("Could not start mysql: %s" % err)
        sys.exit(1)
    # The query is fed on stdin, as the original "echo ... | mysql" did.
    output, _ = client.communicate(LAST_FILE_QUERY + "\n")
    if client.returncode != 0:
        warn("mysql exited with status %d" % client.returncode)
        sys.exit(1)
    # The first output line is the column header; the value is on the second.
    lines = output.splitlines()
    if len(lines) < 2:
        return ""
    return lines[1].strip()


def main():
    """Check the invoking user, then download every file not yet imported."""
    if os.getuid() != REQUIRED_UID:
        warn("Script must be run as med-import, not as %s" % os.getuid())
        warn("run as sudo -u med-import %s" % sys.argv[0])
        sys.exit(1)

    # Figure out from the last entry in the database what the last read file is
    lastfile = read_last_imported_file()
    print("Last imported file is  %s" % lastfile)
    parsed = split_file_name(lastfile)
    if parsed is None:
        print("No match to expected file name pattern")
        sys.exit(1)
    prefix, number = parsed
    print("%s %s" % (prefix, number))

    os.chdir('data')
    while True:
        number += 1
        # Argument list instead of a shell string: the prefix comes from the
        # database and must not be interpreted by a shell.
        try:
            status = subprocess.call(
                ["curl", "-O", download_url(prefix, number)]
            )
        except OSError as err:
            warn("Could not start curl: %s" % err)
            sys.exit(1)
        if status == 0:
            continue
        if status in CURL_FILE_MISSING:
            warn("No more files exist")
            warn("Now run 'sudo -u med-import ./run-medline-import &'")
            sys.exit(0)
        # Any other failure (network, disk, ...) is a real error, not the
        # normal end of the file list.
        warn("curl failed with exit status %d" % status)
        sys.exit(1)


if __name__ == "__main__":
    main()
Note: See TracBrowser for help on using the repository browser.