forked from globus-labs/solvation-energy-ml
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdlhub_app.py
119 lines (95 loc) · 4.97 KB
/
dlhub_app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
from keras.models import Model, load_model
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import NearestNeighbors
from jcesr_ml.mpnn import set_custom_objects, run_model, SolvationPreprocessor
from typing import List, Iterable, Dict, Optional
import pickle as pkl
import numpy as np
import gzip
import json
import os
set_custom_objects()
# Set the batch size
# TODO (wardlt): Allow ability for users to configure, important for high-throughput screening
# but not most uses
batch_size = 128
# Location for the global variables
# Large objects that I only want to read from disk once!
model: Optional[Model] = None # MPNN model used to predict solvation energy
converter: Optional[SolvationPreprocessor] = None # Tool used to compute inputs for MPNNs
error_model: Optional[LinearRegression] = None # Predicts error given distance from training set
error_model_prob: Optional[LogisticRegression] = None # Predicts likelihood of error over 1 kcal/mol
error_rep_model: Optional[Model] = None # Compute molecule representation
error_rep_nn_scaler: Optional[MinMaxScaler] = None # Scales the representation to [0, 1]
error_rep_nn_computer: Optional[NearestNeighbors] = None # Computes the the distance from the training set
error_rep_nn_dist_scale: Optional[Dict] = None # Values used to scale the distances
dielectric_values: Optional[Iterable[float]] = None # Dielectric constants used in search
def _load_model():
global model, converter, error_model, error_rep_model, error_model_prob
global error_rep_nn_scaler, error_rep_nn_computer, dielectric_values, error_rep_nn_dist_scale
if model is not None:
return
# Load in the model
model = load_model(os.path.join('model', 'best_model.h5'))
# Load in the converter
with open(os.path.join('model', 'converter.pkl'), 'rb') as fp:
converter = pkl.load(fp)
converter.atom_tokenizer.train = converter.bond_tokenizer.train = False
# Get the list of dielectric constants used to train the model
dielectric_values = converter.dielectric_cnsts
# Load in the error-model-related stuff
with open('error-models/train_dist_mol_64_mag.pkl', 'rb') as fp:
error_model = pkl.load(fp)
with open('error-models/train_dist_mol_64_prob.pkl', 'rb') as fp:
error_model_prob = pkl.load(fp)
error_rep_model = load_model('error-models/molecule-rep-model.h5')
with open('error-models/molecule-nn-scaler.pkl', 'rb') as fp:
error_rep_nn_scaler = pkl.load(fp)
with gzip.open('error-models/molecule-nn-computer.pkl.gz') as fp:
error_rep_nn_computer = pkl.load(fp)
with open('error-models/train_dist_mol_64-dist-scaling.json') as fp:
error_rep_nn_dist_scale = json.load(fp)
def evaluate_molecules(molecules: List[str], dielectric_constants: Optional[Iterable[float]] = None) -> Dict:
"""Compute the atomization energy of molecules
Args:
molecules ([str]): XYZ-format molecular structures. Assumed to be
fully-relaxed
dielectric_constants ([float]): List of dielectric constants
Returns:
([float]): Estimated G4MP2 atomization energies of molecules
"""
_load_model()
# Add default arguments
if dielectric_constants is None:
dielectric_constants = dielectric_values
# Prepare the output dict
output = dict(smiles=molecules)
# Set the dielectirc constants in the converter
converter.dielectric_cnsts = list(dielectric_constants)
if any([e < min(dielectric_values) or e > max(dielectric_values) for e in dielectric_constants]):
output['warnings'] = [f'Some dielectrics outside of range in training set: {min(dielectric_values):.2f} '
f'- {max(dielectric_values):.2f}']
# Compute the solvation energies
solv_energies = run_model(model, converter, molecules, chunk_size=batch_size)
output['solvation-energies'] = solv_energies.tolist()
output['dielectric-constants'] = converter.dielectric_cnsts
# Run the error estimation model
mol_rep = run_model(error_rep_model, converter, molecules, chunk_size=batch_size)
mol_rep_scaled = error_rep_nn_scaler.transform(mol_rep)
mol_dists = error_rep_nn_computer.kneighbors(mol_rep_scaled)[0]
mean_dists = mol_dists.mean(axis=1)
# Scale the training set distances
mean_dists = (mean_dists - error_rep_nn_dist_scale['min_dist']) / (
error_rep_nn_dist_scale['max_dist'] - error_rep_nn_dist_scale['min_dist'])
output['training-set-distance'] = mean_dists.tolist()
# Run them through the error models
output['expected-error'] = np.exp(error_model.predict(mean_dists[:, None])).tolist()
output['likelihood-error-above-1kcal/mol'] = error_model_prob.predict_proba(mean_dists[:, None])[:, 1].tolist()
return output
if __name__ == "__main__":
# Get some data
mols = ['C', 'CC', 'CCC']
# Evaluate the energies
pred = evaluate_molecules(mols)
print(json.dumps(pred, indent=2))