-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_molecule.py
86 lines (62 loc) · 3.15 KB
/
parse_molecule.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# Built-in imports
import collections
from pprint import pprint
# Sample chemical formulas passed to parse_molecule() when this file is run as a script.
# Plain formula without any bracketed group.
water = 'H2O'
# Formula with one round-bracketed group followed by a multiplier.
magnesium_hydroxide = 'Mg(OH)2'
# Formula with a round-bracketed group nested inside a square-bracketed group.
fremy_salt = 'K4[ON(SO3)2]2'
def _read_multiplicity(chemical_formula, start):
    """
    Read the optional run of digits that begins at index ``start``.

    :param chemical_formula: Chemical formula being parsed.
    :param start: Index of the first character that may be a digit.
    :return: Tuple ``(multiplicity, next_index)`` where ``next_index`` is the index of the first character after the
             digits. The multiplicity is 1 when no digit is present.
    :rtype: tuple
    """
    formula_length = len(chemical_formula)
    end = start
    while end < formula_length and chemical_formula[end].isdigit():
        end += 1
    # An empty slice means no explicit multiplier was written, hence the default of 1.
    return int(chemical_formula[start:end] or 1), end


def parse_molecule(chemical_formula):
    """
    Count the number of atoms of each element contained in a chemical formula.

    Round, square and curly brackets delimit subformulas and may be nested; the number following a closing bracket
    multiplies every atom of that subformula. The kind of bracket is not checked: any closing bracket closes the most
    recently opened subformula. Any character that is not a bracket starts an atom token, which is extended by the
    lowercase letters that follow it (ex: ``Mg``) and then by an optional multiplier.

    :param chemical_formula: Chemical formula to parse (ex: ``'K4[ON(SO3)2]2'``).
    :return: ``collections.Counter`` (a dict subclass) mapping each atom name to its count, or ``None`` when
             ``chemical_formula`` is empty.
    :rtype: dict
    :raises ValueError: If the brackets are unbalanced, i.e. a closing bracket has no matching opening bracket or an
                        opening bracket is never closed.
    """
    if not chemical_formula:
        return None

    formula_length = len(chemical_formula)

    # A stack of Counters. The first one holds the top-level formula; each opening bracket pushes a new Counter for
    # the subformula it starts, which allows multiple and nested brackets to be handled.
    subformulas_list = [collections.Counter()]

    # Brackets can be round, squared or curly.
    opening_brackets = ('(', '[', '{')
    closing_brackets = (')', ']', '}')

    main_index = 0
    while main_index < formula_length:
        current_char = chemical_formula[main_index]

        if current_char in opening_brackets:
            # Beginning of a subformula.
            subformulas_list.append(collections.Counter())
            main_index += 1

        elif current_char in closing_brackets:
            # End of a subformula. Only the top-level Counter left on the stack means nothing is open to close.
            if len(subformulas_list) == 1:
                raise ValueError(
                    'Unmatched closing bracket {!r} at index {} in formula {!r}'.format(
                        current_char, main_index, chemical_formula))
            subformula = subformulas_list.pop()
            multiplicity, main_index = _read_multiplicity(chemical_formula, main_index + 1)
            # Folding the closed subformula into its parent, scaled by its multiplicity.
            for atom_name, atom_count in subformula.items():
                subformulas_list[-1][atom_name] += atom_count * multiplicity

        else:
            # Parsing an atom. Atoms can be represented by multiple characters (ex: Al for aluminium or Cl for
            # chlorine): the first character followed by any run of lowercase letters.
            name_end = main_index + 1
            while name_end < formula_length and chemical_formula[name_end].islower():
                name_end += 1
            atom_name = chemical_formula[main_index:name_end]
            multiplicity, main_index = _read_multiplicity(chemical_formula, name_end)
            # Adding the multiplicity of the atom to the current subformula's Counter.
            subformulas_list[-1][atom_name] += multiplicity

    # Any Counter left above the top-level one belongs to a bracket that was never closed; returning it would
    # silently drop every atom outside that bracket.
    if len(subformulas_list) != 1:
        raise ValueError('Unclosed opening bracket in formula {!r}'.format(chemical_formula))

    return subformulas_list[0]
if __name__ == '__main__':
    # Demo: pretty-print the atom counts of each sample formula, in the same order as before.
    for sample_formula in (water, magnesium_hydroxide, fremy_salt):
        pprint(parse_molecule(sample_formula))