Merge pull request #1 from stevebritton/master

Keeping Charles busy
devopsgroup-io · Jun 7, 2017 · 43024d9
2 parents 8c81a53 + 84646d4
commit 43024d9
Show file tree

Hide file tree

Showing 2 changed files with 30 additions and 4 deletions.
diff --git a/data.yml b/data.yml
@@ -198,6 +198,31 @@ Pulmonary Arterial Hypertension:
       company: Actelion
       generic: epoprostenol systemic
     regex: '[A-Z][A-Z][A-Z]-[0-9][0-9][0-9][0-9][0-9]'
+Respiratory - Chronic Obstructive Pulmonary Disease:
+  bevespi.com:
+    dates:
+      20170430:
+        code: 2047004-3340900
+    drug:
+      company: AstraZeneca
+      generic: formoterol and glycopyrrolate
+    regex: '[0-9][0-9][0-9][0-9][0-9][0-9][0-9]-[0-9][0-9][0-9][0-9][0-9][0-9][0-9]'
+  mysymbicort.com:
+    dates:
+      20170228:
+        code: 1945203-3326734
+    drug:
+      company: AstraZeneca
+      generic: budesonide and formoterol
+    regex: '[0-9][0-9][0-9][0-9][0-9][0-9][0-9]-[0-9][0-9][0-9][0-9][0-9][0-9][0-9]'
+  startwithanoro.com:
+    dates:
+      20170228:
+        code: 783658R0
+    drug:
+      company: GSK
+      generic: umeclidinium and vilanterol
+    regex: '[0-9][0-9][0-9][0-9][0-9][0-9][A-Z][0-9]'
 Rheumatoid Arthritis:
   arava.com:
     dates:
@@ -252,6 +277,6 @@ Rheumatoid Arthritis:
       20170528:
         code: 0026VMV02
     drug:
-      company: Horizon
+      company: Horizon Pharma
       generic: esomeprazole / naproxen systemic
     regex: '[0-9][0-9][0-9][0-9][A-Z][A-Z][A-Z][0-9][0-9]'
diff --git a/moai.py b/moai.py
@@ -29,7 +29,7 @@ def dict_constructor(loader, node):
 
 # find regulatory code changes
 for indication in data:
-        
+
     # what indication?
     print indication
 
@@ -41,8 +41,9 @@ def dict_constructor(loader, node):
         # get the html
         request = urllib2.Request('http://' + website, headers={'User-Agent' : "Moai"})
         html_content = urllib2.urlopen(request).read()
+
         # search for the code using the regex defined per website
-        live_matches = re.findall(data[indication][website]['regex'], html_content);
+        live_matches = re.findall(data[indication][website]['regex'], re.sub('<[^<]+?>', '', html_content));
 
         # get the most recent date
         most_recent_date = data[indication][website]['dates'].keys()[-1]
@@ -109,7 +110,7 @@ def dict_constructor(loader, node):
         plt.savefig('data/' + website.replace("/","-") + '.png', bbox_inches='tight')
 
         plt.close('all')
-        
+
         content += '\n| [{0}](http://{0}) | {1} | {2} | ![{3}](data/{3}.png) |'.format(website, data[indication][website]['drug']['company'], data[indication][website]['drug']['generic'], website.replace("/","-"))