-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdata_to_array.py
152 lines (123 loc) · 5.96 KB
/
data_to_array.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
import os
from General_functions import ML_database
import glob
import selfies as sf
import sys
class MachineLearningInput():
    """Mixture of supported molecules plus state point, as machine-learning input.

    INPUT: keep this order: [name molecules] , [fraction molecules], Temperature, Pressure

    Positional arguments:
        args[0]: sequence of molecule names; each must be a supported name.
        args[1]: sequence of fractions, one per molecule.
        args[2]: optional temperature in K (default 300).
        args[3]: optional pressure in Pa (default 1e4).

    The program exits (``SystemExit`` via ``sys.exit``) when the molecules or
    fractions are missing, when a molecule is not supported, or when the
    number of fractions differs from the number of molecules.

    Attributes set by the constructor: ``molecules``, ``fractions``,
    ``temprature`` (historical spelling, kept so existing callers keep
    working) and ``pressure``. ``Add_Selfies`` additionally sets ``selfies``.
    """

    # Single source of truth: supported molecule name -> SMILES string.
    # Insertion order is the order used when building the encoding database.
    _SMILES_BY_NAME = {
        "C7": "CCCCCCC",
        "2mC6": "CCCCC(C)C",
        "3mC6": "CCCC(C)CC",
        "22mC5": "CCCC(C)(C)C",
        "23mC5": "CCC(C)C(C)C",
        "24mC5": "CC(C)CC(C)C",
        "33mC5": "CCC(C)(C)CC",
        "3eC5": "CCC(CC)CC",
        "223mC4": "CC(C)C(C)(C)C",
    }

    _USAGE = ("Give input like this: MachineLearningInput([name molecules] , "
              "[fraction molecules], Temperature, Pressure)")

    def __init__(self, *args):
        # No bare ``except`` anywhere below: it would also catch the
        # SystemExit raised by sys.exit and silently cancel the exit.
        if not args:
            sys.exit(self._USAGE)

        molecules = args[0]
        for item in molecules:
            if str(item) not in self._SMILES_BY_NAME:
                sys.exit('Input a supported molecule: "C7","2mC6","3mC6","22mC5","23mC5","24mC5","33mC5", "3eC5", "223mC4"')
        self.molecules = molecules

        if len(args) < 2:
            sys.exit(self._USAGE)
        fractions = args[1]
        try:
            lengths_match = len(fractions) == len(molecules)
        except TypeError:
            # Fractions (or molecules) were not given as a sized sequence.
            sys.exit(self._USAGE)
        if not lengths_match:
            sys.exit("Not for all molecules are fractions given, exit program.")
        self.fractions = fractions

        if len(args) > 2:
            self.temprature = args[2]
        else:
            print("No temperature given, default initialization: T = 300K")
            self.temprature = 300

        if len(args) > 3:
            self.pressure = args[3]
        else:
            print("No pressure given, default initialization: p = 1e4 Pa")
            self.pressure = 1e4

    def Print(self):
        """Prints properties of object"""
        print(f"Set pressure: p = {self.pressure} Pa\nSet temprature: T = {self.temprature}K")
        print("Molecules, Fraction")
        for molecule, fraction in zip(self.molecules, self.fractions):
            print(str(molecule) + ", " + str(fraction))

    def SelfiesDatabase(self):
        """Returns a dictionary of all possible selfies codes.

        Every supported molecule is converted from SMILES to SELFIES, one-hot
        encoded against the shared alphabet (padded to the longest molecule)
        and flattened into a 1-D vector.

        Returns:
            dict mapping molecule name -> flattened one-hot numpy vector.
        """
        molecule_names = list(self._SMILES_BY_NAME)
        smiles_dataset = list(self._SMILES_BY_NAME.values())
        selfies_dataset = list(map(sf.encoder, smiles_dataset))  # transforming to selfies

        max_len = max(sf.len_selfies(s) for s in selfies_dataset)
        # Alphabet of every symbol that occurs in the database.
        symbols = sf.get_alphabet_from_selfies(selfies_dataset)
        symbols.add("[nop]")  # padding symbol, required when padding to max_len
        # Sort before indexing: the alphabet is a set and set iteration order
        # of strings changes between interpreter runs, which would shuffle
        # the one-hot columns and make the encodings irreproducible.
        vocab_stoi = {symbol: idx for idx, symbol in enumerate(sorted(symbols))}

        molecular_database = {}
        for selfie, name in zip(selfies_dataset, molecule_names):
            # Index [1] selects the one-hot part of the (label, one_hot) pair.
            one_hot = np.array(sf.selfies_to_encoding(selfie, vocab_stoi, pad_to_len=max_len)[1])
            # Flatten the (length x alphabet) matrix into a single vector.
            molecular_database[name] = one_hot.reshape(-1)
        return molecular_database

    def Add_Selfies(self):
        """Adds the Selfies of the used molecules considering fractions.

        Stores the fraction-weighted sum of the molecules' encoding vectors
        in ``self.selfies`` (``0`` when there are no molecules).
        """
        selfies_dict = self.SelfiesDatabase()
        self.selfies = sum(
            fraction * selfies_dict[molecule]
            for molecule, fraction in zip(self.molecules, self.fractions)
        )
# Demo run, executed whenever this module is run or imported: build the input
# for a 0.7 / 0.3 mixture of "C7" and "23mC5" (no temperature or pressure is
# passed, so the constructor falls back to its defaults), print it, and store
# the fraction-weighted encoding on obj.selfies.
obj = MachineLearningInput(["C7", "23mC5"], [0.7,0.3])
obj.Print()
obj.Add_Selfies()
#Convert txt files to np arrays
# directory="2mC6"
# file=directory + "-400out.txt"
# filename= directory + "/" + file
# molecule=np.genfromtxt(filename,delimiter=",",names=True,usecols=(0,1,2,3,4))
# p=molecule['pressure']
# muc=molecule['muc']
# print(type(molecule))
# plt.figure()
# plt.title(file[:-7]+' molecules per unit cell')
# plt.xlabel('Pressure')
# plt.ylabel('muc')
# plt.semilogx(p,muc,'ko',linestyle='dashed')
# plt.show()
# def data_gathering(path_to_output,molecular_database):
# data={}
# outputmaps = os.listdir(path_to_output)
# for outputmap in outputmaps:
# mappath = path_to_output + "/" + str(outputmap)
# if os.path.isdir(mappath):
# files = os.listdir(mappath)
# for file in files:
# #try:
# paths = mappath + "/" + str(file)
# label = file.split("out")[0]
# pressure,molkg,molkg_err=np.loadtxt(paths,delimiter=',',skiprows=1,usecols=(0,3,4),unpack=True)
# dt=np.dtype([('selfie',np.ndarray),('pressure',np.float64),('temps',int),('molkg',np.float64),('molkg_err',np.float64)])
# data_array= np.zeros(len(pressure), dtype=dt)
# temps = int(label[-3:])*np.ones(len(pressure))
# data_array['selfie']=np.repeat(molecular_database[label[:-4]][:,np.newaxis],len(pressure),axis=1).T
# data_array['pressure']=pressure
# data_array['temps']=temps
# data_array['molkg']=molkg
# data_array['molkg_err']=molkg_err
# print(data_array)
# data[label]=data_array
# #data[label]=np.append(molecular_database,data_array)
# return data
# def get_data(path_to_output):
# paths = glob.glob(path_to_output + "/*.txt")
# for path in paths:
# simulation = path.split("/")[-1].split("out.txt")
# molecule = simulation[:-3]
# temperature = simulation[-3:]
# path_to_out='Raspa/outputs'
# chemstructure=ML_database()
# dt=np.dtype([('selfie',np.ndarray)])
# arr=np.zeros(100,dtype=dt)
# c=np.repeat(chemstructure['C7'][:,np.newaxis],100,axis=1)
# data=data_gathering(path_to_out, chemstructure)