-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwikichomp.py
95 lines (83 loc) · 2.74 KB
/
wikichomp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import cgi
import random
import re
import string
import sys
import urllib
import urllib2
import xml.dom.minidom

from google.appengine.ext import db
form = cgi.FieldStorage()
if not form.has_key("input"):
print """Content-Type: text/html
<html>
<head>
<title>[[Wikipedia|Wikichomp]] Acronymizer</title>
</head>
<body>
<form action="/" method="get">
<div><input type="text" name="input" size="60"/></div>
<div><input type="submit" value="Acronymize Word or Phrase w/ Its Wikipedia Article!"/></div>
</form>
</body>
</html>
"""
sys.exit()
class Vocab(db.Model):
    """Datastore entity holding the word list collected for one term."""
    # The normalized term this word list belongs to (queried by equality).
    title = db.StringProperty(required=True)
    # The words harvested for that term, stored as a list of strings.
    werdz = db.StringListProperty(required=True)
print 'Content-Type: text/plain'
print ''
term = form.getvalue("input")
agent = {'User-Agent': 'Mozilla/5.0 (F-Minus Loser Anti-Art; Me; emdash)'}
#grab the page and chomp relevant terms
req = urllib2.Request("http://en.wikipedia.org/wiki/%s" % term.replace(' ', '_'), \
headers=agent)
dom = xml.dom.minidom.parseString(urllib2.urlopen(req).read())
edit = [l.getAttribute('href') \
for l in dom.getElementsByTagName('link') \
if l.getAttribute('rel') == 'edit']
if not edit:
edit = [l.firstChild.getAttribute('href') \
for l in dom.getElementsByTagName('li') \
if l.getAttribute('id') == 'ca-viewsource']
req = urllib2.Request("http://en.wikipedia.org" + edit[0], headers=agent)
wiki_dump = urllib2.urlopen(req).read().lower()
relevance = re.compile(r'\[\[([\w\':,.()# -]+)(?:\|([\w\':,.()# -]+))?\]\]')
relephants = re.findall(relevance, wiki_dump)
#construct vocabulary
# Every match is a (target, label) pair; the label is '' when the link had
# none. One leading non-word character is stripped from each piece.
leading_junk = re.compile('^[^\w]')
vocabulary = []
for target, label in relephants:
    vocabulary.append(leading_junk.sub('', target))
    if label:
        vocabulary.append(leading_junk.sub('', label))

# De-duplicate, drop the empty string (a piece that was a single non-word
# character), and sort.
unique_terms = set(vocabulary)
unique_terms.discard('')
vocab = sorted(unique_terms)
#form a dictionary from vocabulary
# Normalize the term: lowercase, underscores to spaces, punctuation removed.
# The underscore swap must happen first because string.punctuation contains
# '_', which would otherwise delete the word separator and fuse the words.
#got from http://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python
acro_term = term.lower().replace('_', ' ').translate(
    string.maketrans("", ""), string.punctuation)

# Index the vocabulary by first character once, instead of rescanning the
# whole list for every letter of the term. vocab is sorted, so each bucket
# stays in sorted order.
words_by_initial = {}
for word in vocab:
    words_by_initial.setdefault(word[0], []).append(word)

# One entry per character of acro_term: '' marks a word break, otherwise a
# non-empty list of candidate words starting with that character.
acronym_dict = []
for element in acro_term:
    if element == ' ':
        acronym_dict.append('')
    else:
        # When no vocabulary word starts with this character, fall back to
        # the character itself so the output never has a gap.
        acronym_dict.append(list(words_by_initial.get(element, [element])))
# Persist the vocabulary for this term. The query results are materialized
# once: iterating the query object a second time would execute it again.
werds = Vocab.gql("WHERE title = :1", acro_term)
stored = list(werds)
if not stored:
    # First time this term is seen: create its record.
    werd = Vocab(title=acro_term, werdz=vocab)
    werd.put()
else:
    # Merge the freshly harvested words into every existing record.
    for werd in stored:
        werd.werdz = sorted(set(vocab).union(werd.werdz))
        werd.put()
# now generate a random acronym from the dictionary
print acro_term + " acronymized:\n"
for each in acronym_dict:
if each == '':
print
else:
print random.choice(each).capitalize()
print "\n" + acro_term + " acronymized!\n"