forked from ENCODE-DCC/WranglerScripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ENCODETools.py
376 lines (345 loc) · 13.9 KB
/
ENCODETools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
import os
import sys
import csv
import json
import jsonschema
import requests
from pyelasticsearch import ElasticSearch
import xlrd
import xlwt
from base64 import b64encode
import gdata
import gdata.spreadsheet.service
# Default HTTP headers passed to every requests.get/patch/put/post call in this module.
# NOTE(review): the original author was unsure whether this header set is
# right for every request type -- confirm against what the server expects.
HEADERS = {'content-type': 'application/json'}
def get_ENCODE(obj_id,keys):
    '''
    GET one ENCODE object and return the decoded JSON body.

    obj_id: object path, appended as-is to keys['server']
    keys: dict providing 'server', 'authid' and 'authpw'

    A non-200 status is reported on stderr; the body is decoded and
    returned either way.
    '''
    target = ''.join((keys['server'], obj_id, '?limit=all'))
    credentials = (keys['authid'], keys['authpw'])
    reply = requests.get(target, auth=credentials, headers=HEADERS)
    if reply.status_code != 200:
        sys.stderr.write(reply.text + '\n')
    return reply.json()
def GetENCODE(object_id,keys):
    '''
    GET an ENCODE object as JSON and return it as a dict.

    object_id: object path, appended as-is to keys['server']; it must be a
               str, any other type returns None without sending a request
    keys: dict providing 'server', 'authid' and 'authpw'

    Returns the decoded JSON body, or None when object_id is not a str or
    when the request itself raised. A non-200 status is reported on stderr
    but the body is still decoded and returned.
    '''
    if not isinstance(object_id, str):
        return None
    url = keys['server']+object_id+'?limit=all'
    # keep the try body down to the one call that talks to the network
    try:
        response = requests.get(url,auth=(keys['authid'],keys['authpw']), headers=HEADERS)
    except Exception as e:
        # best-effort lookup: say why it failed instead of hiding the cause,
        # then fall back to None as before
        print("Get request failed:")
        print(e)
        return None
    if response.status_code != 200:
        sys.stderr.write(response.text + '\n')
    return response.json()
def patch_ENCODE(obj_id,patch_json,keys):
    '''
    PATCH an existing ENCODE object and return the decoded JSON response.

    obj_id: object path, appended as-is to keys['server']
    patch_json: JSON-serialisable value sent as the request body
    keys: dict providing 'server', 'authid' and 'authpw'

    The status code is echoed to stdout; on a non-200 status the response
    text is also written to stderr.
    '''
    target = keys['server'] + obj_id
    body = json.dumps(patch_json)
    credentials = (keys['authid'], keys['authpw'])
    reply = requests.patch(target, auth=credentials, headers=HEADERS, data=body)
    print("Patch:")
    print(reply.status_code)
    if reply.status_code != 200:
        sys.stderr.write(reply.text + '\n')
    return reply.json()
def replace_ENCODE(obj_id,put_json,keys):
    '''
    PUT (replace) an existing ENCODE object and return the decoded JSON response.

    obj_id: object path, appended as-is to keys['server']
    put_json: JSON-serialisable value sent as the request body; it is
              always passed through json.dumps (pre-serialised strings are
              not special-cased)
    keys: dict providing 'server', 'authid' and 'authpw'

    The decoded response is pretty-printed to stdout. On a non-200 status
    the raw response text is also written to stderr.
    '''
    body = json.dumps(put_json)
    target = keys['server'] + obj_id
    credentials = (keys['authid'], keys['authpw'])
    reply = requests.put(target, auth=credentials, headers=HEADERS, data=body)
    pretty = json.dumps(reply.json(), indent=4, separators=(',', ': '))
    print(pretty)
    if reply.status_code != 200:
        sys.stderr.write(reply.text + '\n')
    # NOTE(review): the raw text is echoed to stdout on every call -- confirm
    # this was not meant to sit inside the error branch above.
    print(reply.text)
    return reply.json()
def new_ENCODE(collection_id, object_json,keys):
    '''
    POST a new ENCODE object to a collection and return the decoded JSON response.

    collection_id: collection name; the request goes to
                   keys['server'] + '/' + collection_id + '/'
    object_json: JSON-serialisable value sent as the request body
    keys: dict providing 'server', 'authid' and 'authpw'

    The status code is echoed to stdout; any status other than 201 also
    writes the response text to stderr.
    '''
    target = ''.join((keys['server'], '/', collection_id, '/'))
    body = json.dumps(object_json)
    credentials = (keys['authid'], keys['authpw'])
    reply = requests.post(target, auth=credentials, headers=HEADERS, data=body)
    print(reply.status_code)
    if reply.status_code != 201:
        sys.stderr.write(reply.text + '\n')
    return reply.json()
def KeyENCODE(key_file,user_name,server_name):
    '''
    Look up the credentials for one user/server pair in a tab-delimited key file.

    key_file: path to a tab-delimited file whose header row names the
              columns 'Server', 'User', 'ID' and 'PW'
    user_name: value to match against the 'User' column
    server_name: value to match against the 'Server' column; it is also
                 used to build the URL 'http://<Server>.encodedcc.org'

    Returns a dict with the keys 'user', 'server', 'authid' and 'authpw'.
    When several rows match, the last one wins.
    Raises ValueError when no row matches (this used to surface as an
    UnboundLocalError on the return statement).
    '''
    key_info = None
    # the with-block closes the file even if reading a row fails
    with open(key_file) as key_open:
        for key in csv.DictReader(key_open, delimiter='\t'):
            # boolean 'and' rather than bitwise '&': same result on bools,
            # but it short-circuits and says what is meant
            if key.get('Server') == server_name and key.get('User') == user_name:
                key_info = {
                    'user': key.get('User'),
                    'server': 'http://' + key.get('Server') + '.encodedcc.org',
                    'authid': key.get('ID'),
                    'authpw': key.get('PW'),
                }
                print('Identity confirmed')
    if key_info is None:
        raise ValueError('No key for user %r on server %r in %s'
                         % (user_name, server_name, key_file))
    return key_info
def ReadJSON(json_file):
    '''
    Read a JSON file and return its content as a list.

    json_file: path of the file to read

    A top-level JSON array is returned as-is. Any other top-level value
    (an object, or a bare string/number/null) is wrapped in a one-element
    list, so callers can always iterate over the result. Bare scalars used
    to fail with an UnboundLocalError.
    '''
    # the with-block closes the file even when json.load raises
    with open(json_file) as json_load:
        json_read = json.load(json_load)
    if isinstance(json_read, list):
        return json_read
    return [json_read]
def WriteJSON(new_object,object_file,indent=None):
    '''
    Write a JSON-serialisable object to a file, overwriting any existing content.

    new_object: the value to serialise
    object_file: path of the file to write
    indent: optional indent width handed to json.dump for human-readable
            output; the default (None) keeps the compact single-line
            format this function has always produced
    '''
    # the with-block closes the file; no explicit close() is needed
    with open(object_file, 'w') as outfile:
        json.dump(new_object, outfile, indent=indent)
def ValidJSON(object_type,object_id,new_object,keys):
    '''
    Validate an object against the schema the server publishes for its type.

    object_type: schema name; the schema is fetched from
                 '/profiles/<object_type>.json' via GetENCODE
    object_id: label used only in the printed success/failure message
    new_object: the object to validate
    keys: credentials dict, handed through to GetENCODE

    Returns True when jsonschema.validate accepts the object. Any exception
    raised during validation is printed and reported as False.
    '''
    schema_path = '/profiles/' + object_type + '.json'
    object_schema = GetENCODE(schema_path, keys)
    try:
        jsonschema.validate(new_object, object_schema)
    except Exception as problem:
        # did not validate: report why
        print('Validation of ' + object_id + ' failed.')
        print(problem)
        return False
    # did validate
    print('Validation of ' + object_id + ' succeeded.')
    return True
def CleanJSON(new_object,object_schema,action):
    '''
    Strip properties that the schema does not allow for the given request method.

    new_object: dict to clean; it is modified in place and also returned
    object_schema: schema dict with a 'properties' mapping
    action: request method name to test against each property's
            'requestMethod' list (for example 'PATCH')

    A property is removed when
      * the schema has no (non-empty) entry for it, or
      * its schema entry has a 'requestMethod' list that does not contain
        action (an empty list therefore always removes the property).
    Properties without a 'requestMethod' entry are kept, so items that can
    only be POSTed are not removed unless the schema says so.
    '''
    properties = object_schema[u'properties']
    # iterate over a snapshot of the keys: entries are popped inside the loop
    for key in list(new_object):
        property_schema = properties.get(key)
        if not property_schema:
            new_object.pop(key)
        elif u'requestMethod' in property_schema:
            # the old extra test "requestMethod is []" was an identity check
            # against a fresh list and could never be true; an empty list is
            # already covered here because nothing is "in" it
            if action not in property_schema[u'requestMethod']:
                new_object.pop(key)
    return new_object
def FlatJSON(json_object,keys):
    '''
    Collapse embedded objects back to their identifiers.

    json_object: dict to flatten; it is first run through EmbedJSON, then
                 modified in place and returned
    keys: credentials dict, handed through to EmbedJSON

    A dict value is replaced by its '@id' (or, failing that, its 'href');
    a dict with neither is left alone. A list value is replaced by a new
    list in which every dict member has been collapsed the same way.
    '''
    json_object = EmbedJSON(json_object,keys)
    for name, content in json_object.items():
        if type(content) is dict:
            json_object[name] = _flatten_reference(content)
        elif type(content) is list:
            json_object[name] = [_flatten_reference(member) for member in content]
    return json_object


def _flatten_reference(candidate):
    '''Return the '@id' (else 'href') of a dict; return anything else unchanged.'''
    if type(candidate) is not dict:
        return candidate
    if u'@id' in candidate:
        return candidate[u'@id']
    if u'href' in candidate:
        return candidate[u'href']
    return candidate
def EmbedJSON(json_object,keys):
    '''
    Expand path references in a JSON object into the objects they point to.

    json_object: dict to expand; it is modified in place and returned
    keys: credentials dict, handed through to GetENCODE

    A unicode value longer than one character that starts with '/' is
    fetched with GetENCODE and replaced when the result is a dict. A list
    value is replaced only when every one of its entries could be fetched
    that way; otherwise the list is left untouched. Expansion is one level
    deep: the fetched objects are not expanded themselves.
    '''
    for key,value in json_object.items():
        if type(value) is unicode:
            if len(value) > 1 and str(value[0]) == '/':
                json_sub_object = GetENCODE(str(value),keys)
                if type(json_sub_object) is dict:
                    json_object[key] = json_sub_object
        elif type(value) is list:
            values_embed = []
            for entry in value:
                if type(entry) is unicode and len(entry) > 1 and str(entry[0]) == '/':
                    json_sub_object = GetENCODE(str(entry),keys)
                    if type(json_sub_object) is dict:
                        values_embed.append(json_sub_object)
            # compare the lengths by value: "is" tests object identity, which
            # only happens to hold for the small integers CPython caches, so
            # long lists were never replaced
            if len(values_embed) == len(value):
                json_object[key] = values_embed
    return json_object
def ElasticSearchJSON(server,query,object_type,hitnum):
    '''
    Run an elasticsearch query and return the matching JSON objects.

    server: address of the elasticsearch server, for example
        'http://submit.encodedcc.org:9200'
    query: a dict formatted as specified by elasticsearch.
        the match-everything query is {'query': {'match_all': {}}}
    object_type: name of the index to search, for example 'biosample'
        this can also be a list of object types
    hitnum: the maximum number of returned json objects
        (10000 has been enough so far)

    Returns a list holding the '_source' document of every hit.
    '''
    es_client = ElasticSearch(server)
    response = es_client.search(query, index=object_type, size=hitnum)
    # the hits sit inside a dict of search metrics; keep only the documents
    return [hit[u'_source'] for hit in response['hits']['hits']]
def FindSets(jsonobjects,query,returnset):
    '''
    Search a list of JSON objects, and the objects embedded in them, for key/value matches.

    Each object is treated as a set: the object itself plus every dict
    nested in it, directly or inside a list. Only dicts that carry an '@id'
    key are examined; anything else is skipped and ends up in neither
    returned collection.

    Input
    jsonobjects: a list of JSON objects (dicts) that will be searched.
        This can either be a uniform collection or not, but each object
        will be treated as a set.
    query: a dict with key:value pair(s) to search for. A pair matches a
        property when the query key is a substring of str(property name)
        and the query value is a substring of str(property value).
        Currently, only works as an 'OR' search.
    returnset: a string to indicate how to return values
        'original': returns only root object
        'only': returns only objects containing the match
        'all': returns all objects from the set with the match
    Output
    foundobjects: the JSON objects that match the search parameters.
    otherobjects: the JSON objects that don't match.
    Non-empty results are de-duplicated by '@id' and come back as the
    values() of a dict, so their order is not guaranteed; an empty result
    stays an empty list.
    '''
    foundobjects = []
    otherobjects = []
    for jsonobject in jsonobjects:
        # objects without an '@id' are ignored entirely
        if jsonobject.has_key(u'@id'):
            # matches / non-matches collected from nested objects
            subfoundobjects = []
            subotherobjects = []
            foundobject = False
            # query pairs that matched a property of this object itself
            querycheck = {}
            for key,value in jsonobject.items():
                if type(value) is dict:
                    # embedded object: search it recursively
                    [sfobjs,soobjs] = FindSets([value],query,returnset)
                    if sfobjs:
                        for sfobj in sfobjs:
                            subfoundobjects.append(sfobj)
                    if soobjs:
                        for soobj in soobjs:
                            subotherobjects.append(soobj)
                elif value and (type(value) is list) and (type(value[0]) is dict):
                    # list of embedded objects (decided by its first entry):
                    # search each entry recursively
                    for item in value:
                        [sfobjs,soobjs] = FindSets([item],query,returnset)
                        if sfobjs:
                            for sfobj in sfobjs:
                                subfoundobjects.append(sfobj)
                        if soobjs:
                            for soobj in soobjs:
                                subotherobjects.append(soobj)
                # NOTE(review): "(type(value) is not dict) or (type(value) is not list)"
                # is true for every value, so this condition always holds and the
                # branch behaves like a plain else -- confirm that is intended.
                elif value and ((type(value) is list) and (type(value[0]) is not dict)) or (type(value) is not dict) or (type(value) is not list):
                    # plain property: substring-match each query pair against
                    # the stringified property name and value
                    for searchkey,searchvalue in query.items():
                        if searchkey in str(key):
                            if searchvalue in str(value):
                                querycheck.update({searchkey:searchvalue})
            # CURRENTLY ONLY CHECKS FOR ANY HIT. WORKS LIKE 'OR' INSTEAD OF 'AND'.
            if querycheck:
                print 'Found.'
                foundobject = True
            # classify the object itself
            if foundobject:
                foundobjects.append(jsonobject)
            elif subfoundobjects and ((returnset == 'all') or (returnset == 'original')):
                # no direct hit, but something nested in it matched
                foundobjects.append(jsonobject)
            else:
                otherobjects.append(jsonobject)
            # nested matches are reported for 'all' and 'only'
            if subfoundobjects and ((returnset == 'all') or (returnset == 'only')):
                for subfoundobject in subfoundobjects:
                    foundobjects.append(subfoundobject)
            # nested non-matches count as found only for 'all', and only
            # when a nested match exists alongside them
            if subfoundobjects and subotherobjects and (returnset == 'all'):
                for subotherobject in subotherobjects:
                    foundobjects.append(subotherobject)
            else:
                for subotherobject in subotherobjects:
                    otherobjects.append(subotherobject)
    # de-duplicate by '@id'; a later duplicate replaces an earlier one
    if foundobjects:
        foundobjects = {foundobj['@id']:foundobj for foundobj in foundobjects}.values()
    if otherobjects:
        otherobjects = {otherobj['@id']:otherobj for otherobj in otherobjects}.values()
    return foundobjects,otherobjects
def LoginGSheet(email,password):
    '''
    Create a spreadsheets service client and log it in.

    email, password: credentials stored on the client before
                     ProgrammaticLogin() is called
    Returns the client.
    '''
    service = gdata.spreadsheet.service.SpreadsheetsService()
    service.email, service.password = email, password
    service.ProgrammaticLogin()
    return service
def FindGSpreadSheet(sheetclient,spreadname):
    '''
    Find a spreadsheet by exact title and return (id, entry).

    sheetclient: client used to fetch the spreadsheets feed
    spreadname: title to search for

    The first feed entry is used; its id is the part of the entry's id
    text after the last '/'. When the feed has no entries, ('', '') is
    returned.
    '''
    title_query = gdata.spreadsheet.service.DocumentQuery()
    title_query.title = spreadname
    title_query.title_exact = 'true'
    feed = sheetclient.GetSpreadsheetsFeed(query=title_query)
    if len(feed.entry) < 1:
        return ('', '')
    first_hit = feed.entry[0]
    return (first_hit.id.text.rsplit('/',1)[1], first_hit)
def FindGWorkSheet(sheetclient,spreadid,workname):
    '''
    Find a worksheet by exact title inside a spreadsheet and return (id, entry).

    sheetclient: client used to fetch the worksheets feed
    spreadid: id of the spreadsheet to look in
    workname: worksheet title to search for

    The first feed entry is used; its id is the part of the entry's id
    text after the last '/'. When the feed has no entries, ('', '') is
    returned.
    '''
    title_query = gdata.spreadsheet.service.DocumentQuery()
    title_query.title = workname
    title_query.title_exact = 'true'
    feed = sheetclient.GetWorksheetsFeed(spreadid,query=title_query)
    if len(feed.entry) < 1:
        return ('', '')
    first_hit = feed.entry[0]
    return (first_hit.id.text.rsplit('/',1)[1], first_hit)
def FindGSheetCells(sheetclient,spreadid,workid):
    '''
    Return the cell entries of one worksheet.

    sheetclient: client used to fetch the cells feed
    spreadid: id of the spreadsheet
    workid: id of the worksheet

    The query sets return_empty to "true", so the feed is asked to include
    empty cells as well.
    '''
    cell_query = gdata.spreadsheet.service.CellQuery()
    cell_query.return_empty = "true"
    cell_feed = sheetclient.GetCellsFeed(spreadid,workid,query=cell_query)
    return cell_feed.entry