# adds paper to database
# usage: python addpaper.py [name|id] [query|id]
# examples:
# python addpaper.py name title of a nice paper
# python addpaper.py id 31415926535
# this should technically be several scripts, todo later
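# note: written for Python 2 (cPickle, urllib2, print statements, raw_input); talks to the
# Microsoft Academic Search JSON API using the AppId key read from appid.txt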
import cPickle as pickle
import time
import string
import json
import urllib2
import urllib
import os.path
import os
import sys
if len(sys.argv) <= 2:
    print "use script properly:"
    print "usage: python addpaper.py [name|id] [query|id]"
    print "examples:"
    print "python addpaper.py name title of a nice paper"
    print "python addpaper.py id 31415926535"
    sys.exit(1)
if not os.path.isfile('appid.txt'):
    print "OOPS! You're missing the Microsoft Academic Search APP ID key in file appid.txt!"
    print "See Readme.md for instructions on obtaining one."
    print "Exiting."
    sys.exit(1)
if not os.path.isdir('db'): os.mkdir('db')
appid = open('appid.txt', 'r').read().rstrip()
globaldb = os.path.join('db', 'papers.p')
if not os.path.isfile(globaldb): pickle.dump([], open(globaldb, "wb"))
# form the query URL to MAS
url = "http://academic.research.microsoft.com/json.svc/search?AppId=%s" % (appid, )
url += "&StartIdx=1&EndIdx=1"
url += "&ResultObjects=publication"
qtype = sys.argv[1]
if qtype == "name":
    q = " ".join(sys.argv[2:])
    q = q.replace(' ', '+')
    url += "&TitleQuery=%s" % (q, )
elif qtype == "id":
    pubid = sys.argv[2]
    url += "&PublicationID=%s" % (pubid, )
else:
    print "invalid query type. use [name|id]. quitting."
    sys.exit(1)
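# e.g. a title query ends up looking roughly like (AppId elided):
# http://academic.research.microsoft.com/json.svc/search?AppId=...&StartIdx=1&EndIdx=1&ResultObjects=publication&TitleQuery=title+of+a+nice+paper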
# perform request
print "querying url: %s..." % (url, )
j = json.load(urllib2.urlopen(url))
if len(j['d']['Publication']['Result']) == 0:
    print "No results found! quitting!"
    sys.exit(1)
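# the response nests results under j['d']['Publication']['Result']; each entry is a publication
# record with the fields used below (ID, Title, Author, Conference/Journal, Year,
# CitationCount, FullVersionURL)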
# go down the results...
rix = 0
while True:
    pub = j['d']['Publication']['Result'][rix] # publication json
    idstr = str(pub['ID'])
    dirpath = os.path.join('db', idstr)
    title = pub['Title']

    # print some info and ask the user to confirm this is the right paper
    papers = pickle.load(open(globaldb, "rb"))
    seenthis = any([pub['ID'] == x['ID'] for x in papers])
    havethis = os.path.isdir(dirpath)
    v = ""
    if pub['Conference']: v = pub['Conference']
    if pub['Journal']: v = pub['Journal']
    print "Found a record:"
    print "title: ", title
    print "author: ", (", ".join(a['FirstName'] + ' ' + a['LastName'] for a in pub['Author']))
    print "published in: ", v, pub['Year']
    print "citations: ", pub['CitationCount']
    print "have record of this: ", seenthis
    print "is in library: ", havethis
    isgood = raw_input("add to library? y/n: ")
    if isgood == "y" or isgood == "":
        break
    else:
        print "ok moving to the next result..."
        rix += 1
        if rix >= len(j['d']['Publication']['Result']):
            print "that's it, not found! quitting."
            sys.exit(1)
# save the information into the global papers database, if we don't already have it
if not seenthis:
    print "Updating papers.p global database."
    papers.append(pub)
    pickle.dump(papers, open(globaldb, "wb"))

# save the individual record for this paper in db/$ID/json.p
if not havethis:
    print "Creating folder %s..." % (dirpath, )
    os.mkdir(dirpath)
jsonpath = os.path.join(dirpath, 'json.p')
pickle.dump(pub, open(jsonpath, "wb"))
print "Writing ", jsonpath
# download both citations and references.
# Done with one loop since these are so similar
xx = ['CitationCount', 'ReferenceCount']
yy = ['Citation', 'Reference']
ff = ['citations.p', 'references.p']
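# the three parallel lists drive two passes of the loop below: i=0 fetches citations
# (count 'CitationCount', ReferenceType 'Citation', output 'citations.p'), i=1 references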
for i in range(2):
    maxn = pub[xx[i]]
    desc = yy[i]
    fname = ff[i]

    doskip = False
    while True:
        nd = raw_input("how many top %s (up to %d) to download for %s? [empty default = all]: " % (desc, maxn, title))
        if nd == "": ndi = maxn
        else: ndi = int(nd)
        if ndi == 0:
            print "ok skipping %s." % (desc, )
            doskip = True
            break
        if ndi > maxn: ndi = maxn
        if ndi > 1000:
            print "More than 1000 is too many. That's crazy, won't allow it."
        else:
            break
    if doskip: continue
    # form request URL and query. Page through results (only top 100 are given)
    print "downloading top %d %s for %s" % (ndi, desc, title)
    pubs = []
    istart = 1
    while True:
        iend = istart + 99
        if iend > ndi: iend = ndi
        print "downloading %d to %d" % (istart, iend)
        url = "http://academic.research.microsoft.com/json.svc/search?AppId=%s" % (appid, )
        url += "&ResultObjects=Publication"
        url += "&ReferenceType=%s" % (desc, )
        url += "&StartIdx=%d&EndIdx=%d" % (istart, iend)
        url += "&PublicationID=%s" % (idstr, )
        print "querying %s ... " % (url, )
        j2 = json.load(urllib2.urlopen(url))
        pubs.extend(j2['d']['Publication']['Result'])
        if iend >= ndi: break
        istart = istart + 100

    # save ids
    ids = [x['ID'] for x in pubs]
    refPicklePath = os.path.join('db', idstr, fname)
    print "writing ", refPicklePath
    pickle.dump(ids, open(refPicklePath, "wb"))

    # extend global papers database
    papers = pickle.load(open(globaldb, "rb"))
    numadded = 0
    for p in pubs:
        if not any([p['ID'] == x['ID'] for x in papers]):
            papers.append(p)
            numadded += 1
    pickle.dump(papers, open(globaldb, "wb"))
    print "wrote %d/%d new entries to papers.p pickle." % (numadded, len(pubs))
opencommand = "gnome-open"
if sys.platform == 'darwin':
    opencommand = "open"
# download full PDF
pdfpath = os.path.join('db', idstr, 'paper.pdf')
urls = pub['FullVersionURL']
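# keep only links that point directly at a .pdf; all links are still printed so one can be
# fetched manually if none of the direct downloads pan out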
pdfurls = [u for u in urls if u.endswith('.pdf')]
gotit = False
print "All paper links:"
for u in urls: print u
for u in pdfurls:
    print "trying to retrieve: ", u
    try:
        urllib.urlretrieve(u, pdfpath)
        print "saved pdf at ", pdfpath
        try:
            print "opening the pdf using %s (%s) for your convenience to verify the download..." % (opencommand, sys.platform)
            os.system(opencommand + " " + pdfpath)
        except Exception, e:
            print "%s failed. Make sure the downloaded pdf %s is correct." % (opencommand, pdfpath, )
        isok = raw_input("download good? y/n: ")
        if isok == "y":
            gotit = True
            break
    except Exception, e:
        print "ERROR retrieving: ", e
if not gotit:
    print "Couldn't get the paper pdf. Please download manually and save as %s." % (pdfpath, )
    kk = raw_input("waiting... press key to continue")
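# note: the thumbnail step below shells out to ImageMagick's `convert`, which is assumed to
# be installed (rendering a PDF typically also requires Ghostscript)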
# create thumbnails
try:
    print "creating paper thumbnails..."
    thumbpath = os.path.join('db', idstr, 'thumb.png')
    cmd = "convert %s -thumbnail 150 -trim %s" % (pdfpath, thumbpath)
    print "running: " + cmd
    os.system(cmd)
except Exception, e:
    print "creating thumbnails failed:"
    print e
# analyze the paper for top words
try:
    print "running topwords.py..."
    os.system("python topwords.py %s" % (idstr, ))
except Exception, e:
    print "topwords.py error:"
    print e

try:
    print "running genjson.py..."
    os.system("python genjson.py")
except Exception, e:
    print "genjson.py error:"
    print e

try:
    print "running copyresources.py..."
    os.system("python copyresources.py %s" % (idstr, ))
except Exception, e:
    print "copyresources.py error:"
    print e
print "done. Open client/index.html to view library."