forked from sopel-irc/sopel-extras
-
Notifications
You must be signed in to change notification settings - Fork 0
/
bookie.py
312 lines (268 loc) · 11.5 KB
/
bookie.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
# coding=utf8
"""bookie.py - Willie URL storage into bookie
Copyright 2014, Antoine Beaupré <[email protected]>
Licensed under the Eiffel Forum License 2.
This will store links found on an IRC channel into a Bookie
instance. It needs to be configured with a username/key to be
functional, per-channel configs are possible.
Bookie is an open-source bookmarking application that is hosted on
http://bookie.io/ and can also be self-hosted. It is similar in
functionality to the http://del.icio.us/ commercial service.
Bookie can be useful to store a cached copy of links mentioned on
IRC. It will also generate an RSS feed of those links automatically,
and more! The author, for example, turns those RSS feeds into ePUB
e-books that are then transferred to his e-book reader so, in effect,
Bookie and this plugin create a way to read links mentioned on IRC on
his e-book reader, offline.
This plugin uses only a tiny part of the Bookie API, we could expand
functionalities here significantly:
https://github.com/bookieio/Bookie/blob/develop/docs/api/user.rst
"""
from __future__ import unicode_literals
from willie import web, tools
from willie.module import commands, rule, example
from willie.modules.url import get_hostname, url_finder, exclusion_char, title_tag_data, quoted_title, re_dcc
from willie.config import ConfigurationError
from datetime import datetime
import getpass
import json
# pytz is optional: without it, timestamps cannot be converted to the user's
# timezone and are reported as-is.
try:
    import pytz
except ImportError:
    # Only a missing package is expected here; anything else should surface.
    pytz = None
import re
import requests
import sys
# Expose the URL-splitting function under the single name ``urlparse`` on
# both Python 2 (``urlparse`` module) and Python 3 (``urllib.parse``).
if sys.version_info.major < 3:
    from urlparse import urlparse
else:
    from urllib.parse import urlparse
# an HTML tag. cargo-culted from etymology.py
r_tag = re.compile(r'<[^>]+>')
# a run of tabs, carriage returns, newlines or spaces; text() collapses each
# run into a single space
r_whitespace = re.compile(r'[\t\r\n ]+')
# Bookie connection settings. They start out unset and are filled in by
# setup() from the bot configuration.
# base URL of the API, ending with api_suffix
api_url = None
# default account name, taken from the configured URL's path
api_user = None
# default API key, taken from the configured URL's query string
api_key = None
# path fragment that separates the instance prefix from the account name
api_suffix = '/api/v1/'
# whether bookmarks are private by default
api_private = None
def text(html):
    '''Crudely reduce an HTML fragment to plain text.

    Tags are removed, every run of whitespace becomes a single space, and
    the stripped result is passed through web.decode.

    cargo-culted from etymology.py'''
    without_tags = r_tag.sub('', html)
    single_spaced = r_whitespace.sub(' ', without_tags)
    return web.decode(single_spaced.strip())
def configure(config):
    """
    | [url] | example | purpose |
    | ---- | ------- | ------- |
    | api_url | https://bookie.io/api/v1/admin/account?api_key=XXXXXX | template URL for the bookie instance |
    | private | True | if bookmarks are private by default |
    | url_per_channel | #channel:admin:XXXXXX:True | per-channel configuration |
    """
    # Nothing to do unless the operator opts in.
    if not config.option('Configure Bookie?', False):
        return

    if not config.has_section('bookie'):
        config.add_section('bookie')

    # (option name, prompt, default) for each interactively asked setting,
    # in the order the questions are asked.
    questions = (
        ('api_url',
         'URL of the Bookie API',
         'https://bookie.io/api/v1/admin/account?api_key=XXXXXX'),
        ('private', 'Mark bookmarks as private', True),
        ('auto', 'Automatically parse bookmarks', False),
    )
    for option_name, prompt, default in questions:
        config.interactive_add('bookie', option_name, prompt, default)

    # Optional per-channel account overrides, stored as a list.
    if config.option('Would you like to configure individual accounts per channel?', False):
        instructions = 'Enter the API URL as #channel:account:key:private'
        config.add_list('bookie', 'url_per_channel', instructions, 'Channel:')
# Every text type a configuration value may arrive as: ``str`` and
# ``unicode`` on Python 2, ``str`` on Python 3.
try:
    _STRING_TYPES = (basestring,)
except NameError:
    _STRING_TYPES = (str,)


def validate_private(private):
    '''convert the private setting to a real bool

    this is necessary because it could be the "true" string...
    we consider every string but lower(true) to be false

    A missing setting (None) means private. Any text value, including
    unicode strings and str subclasses, is True only when it equals
    "true" case-insensitively. Values of any other type are returned
    unchanged.
    '''
    # deal with non-configured private setting
    if private is None:
        return True
    # isinstance() rather than an exact type comparison, so that unicode
    # values and str subclasses are converted too.
    if isinstance(private, _STRING_TYPES):
        return private.lower() == 'true'
    return private
def setup(bot):
    """Initialise the module globals from the bot configuration.

    Splits the configured ``api_url`` into the API base URL, the default
    account and the API key, resolves the default privacy flag, rebuilds
    ``url_finder`` around the exclusion character and, when the ``auto``
    option is set, registers ``bmark`` as a catch-all URL callback.

    Raises ConfigurationError when ``api_url`` is missing or cannot be
    taken apart.
    """
    global url_finder, exclusion_char, api_url, api_key, api_user, api_private

    configured = bot.config.bookie.api_url
    if not configured:
        raise ConfigurationError('Bookie module not configured')

    try:
        # say we have "https://example.com/prefix/api/v1/admin/account?api_key=XXXXXX"
        parsed = urlparse(configured)
        # ["/prefix", "admin/account"]
        around_suffix = parsed.path.split(api_suffix)
        # "https://example.com" + "/prefix" (possibly empty) + "/api/v1/"
        api_url = ''.join((parsed.scheme, '://', parsed.netloc,
                           around_suffix[0], api_suffix))
        # first path element after api_suffix, that is, "admin"
        api_user = around_suffix[1].split('/')[0]
        # "XXXXXX"
        api_key = parsed.query.split('=')[1]
    except Exception as e:
        raise ConfigurationError('Bookie api_url badly formatted: %s' % str(e))

    api_private = validate_private(bot.config.bookie.private)

    # An exclusion character configured for the url module takes precedence.
    if bot.config.has_option('url', 'exclusion_char'):
        exclusion_char = bot.config.url.exclusion_char
    url_finder = re.compile(
        r'(?u)(.*?)\s*(%s?(?:http|https|ftp)(?:://\S+)\s*(.*?))' % (exclusion_char))

    if not bot.config.bookie.auto:
        return
    # Automatic mode: hook bmark in as a callback matching every URL.
    if not bot.memory.contains('url_callbacks'):
        bot.memory['url_callbacks'] = tools.WillieMemory()
    bot.memory['url_callbacks'][re.compile('.*')] = bmark
def shutdown(bot):
    """Remove the catch-all URL callback when automatic mode is enabled."""
    if not bot.config.bookie.auto:
        return
    # Same pattern that setup() used as the dictionary key.
    catch_all = re.compile('.*')
    url_callbacks = bot.memory['url_callbacks']
    del url_callbacks[catch_all]
@commands('bmark')
@example('.bmark #tag description http://example.com', '[ Example ] - example.com')
def bmark(bot, trigger):
    """Bookmark the URLs given as arguments, or the last URL seen in the channel."""
    # cargo-culted from url.py
    if not trigger.group(2):
        # this bookmarks the last URL seen by url.py or this module
        if trigger.sender not in bot.memory['last_seen_url']:
            return
        last_url = bot.memory['last_seen_url'][trigger.sender]
        # process_urls() unpacks every entry as (pre, url, post), the shape
        # produced by url_finder; a bare string would fail to unpack.
        urls = [('', last_url, '')]
    else:
        urls = re.findall(url_finder, trigger)
    process_urls(bot, trigger, urls)
@rule(r'(?u).*(https?://\S+).*')
def title_auto(bot, trigger):
    """Bookmark every URL found in a channel message.

    Messages that invoke the ``bmark`` command are skipped because the
    command handles them itself, and so are messages whose safety_cache
    entry reports more than one positive.

    Unfortunate copy of modules.url.title_auto because I couldn't hook
    into it.
    """
    # NOTE(review): this rule is not conditioned on the ``auto`` option that
    # setup() checks before registering the URL callback - confirm that is
    # intended.
    if re.match(bot.config.core.prefix + 'bmark', trigger):
        return
    # Avoid fetching known malicious links
    if 'safety_cache' in bot.memory and trigger in bot.memory['safety_cache']:
        if bot.memory['safety_cache'][trigger]['positives'] > 1:
            return
    urls = re.findall(url_finder, trigger)
    process_urls(bot, trigger, urls)
def _as_text(payload):
    """Return payload as text, decoding byte strings as UTF-8 (bad bytes dropped)."""
    if isinstance(payload, bytes):
        return payload.decode('utf-8', 'ignore')
    return payload


def _bookmark_status(bot, trigger, resp, headers):
    """Turn the Bookie API response for one bookmark into a short status string."""
    if headers['_http_status'] != 200:
        return 'error from bookie API: %s' % text(_as_text(resp))
    # Parse separately so that a malformed body is recognised by exception
    # type instead of by the wording of the error message.
    try:
        payload = json.loads(resp)
    except ValueError:
        return 'cannot parse JSON response: %s' % _as_text(resp)
    # try to show the user when the bookmark was posted,
    # so they can tell if it's new
    try:
        stored = payload['bmark']['stored']
    except (KeyError, TypeError):
        # the 'stored' field is not in the response?
        return 'no timestamp in %s' % payload
    # assumes that bookie's times are UTC; a timestamp in another format
    # still raises ValueError to the caller, as before
    stamp_format = '%Y-%m-%d %H:%M:%S'
    timestamp = datetime.strptime(stored, stamp_format)
    if pytz:
        # NOTE(review): argument order is kept from the original call and
        # has not been checked against tools.get_timezone - confirm.
        tz = tools.get_timezone(bot.db, bot.config,
                                trigger.nick, trigger.sender)
        formatted = tools.format_time(bot.db, bot.config, tz, trigger.nick,
                                      trigger.sender, timestamp)
    else:
        # No timezone support: render the stored time and mark it with 'Z'.
        formatted = timestamp.strftime(stamp_format) + 'Z'
    return 'posted on ' + formatted


def process_urls(bot, trigger, urls):
    """Bookmark each URL match and report the outcome on the channel.

    ``urls`` is an iterable of ``(pre, url, post)`` tuples: the URL and the
    text around it. URLs starting with the exclusion character are
    skipped. The surrounding text is passed along as extra bookmark data,
    and every processed URL is recorded as the last one seen for the
    sender.
    """
    for pre, url, post in urls:
        if url.startswith(exclusion_char):
            continue
        # Magic stuff to account for international domain names
        try:
            url = web.iri_to_uri(url)
        except Exception:
            # best effort: keep the URL as typed if it cannot be converted
            pass
        bot.memory['last_seen_url'][trigger.sender] = url
        # post the bookmark to the Bookie API
        title, domain, resp, headers = api_bmark(bot, trigger, url, pre + post)
        status = _bookmark_status(bot, trigger, resp, headers)
        message = '[ %s ] - %s (%s)' % (title, domain, status)
        # Guard against responding to other instances of this bot.
        if message != trigger:
            bot.say(message)
def api(bot, trigger, func, data=None):
    """POST ``data`` to the Bookie API call ``func`` and return the reply.

    The account and key default to the module-wide ones. When the message
    comes from a channel that has an entry in ``url_per_channel``
    (``#channel:account:key[:private]``), that entry's account, key and
    privacy flag are used instead.

    Returns a ``(body, headers)`` tuple in which ``headers`` additionally
    carries the HTTP status code under ``'_http_status'``.
    """
    user = api_user
    key = api_key
    if data is None:
        # lets the per-channel privacy flag be stored even without a payload
        data = {}
    if (trigger.sender and not trigger.sender.is_nick() and
            bot.config.has_option('bookie', 'url_per_channel')):
        # Escape the channel name so characters such as '.' or '+' are
        # matched literally.
        pattern = re.escape(trigger.sender) + r':(\w+):(\w+)(?::(\w+))?'
        match = re.search(pattern, bot.config.bookie.url_per_channel)
        if match is not None:
            user = match.group(1)
            key = match.group(2)
            data['is_private'] = int(validate_private(match.group(3)))
    endpoint = '%s%s/%s?api_key=%s' % (api_url, user, func, key)
    bot.debug('bookie', 'submitting to %s data %s' % (endpoint, data), 'verbose')
    # we use requests instead of web.post because Bookie expects
    # JSON-encoded submissions, which web.post doesn't support
    # NOTE(review): a dict passed this way is sent form-encoded by requests,
    # not as JSON - confirm which encoding the Bookie API really accepts.
    r = requests.post(endpoint, data)
    r.headers['_http_status'] = r.status_code
    bot.debug('bookie', 'response: %s (headers: %s, body: %s)' % (r, r.headers, r.text), 'verbose')
    return (r.text, r.headers)
def api_bmark(bot, trigger, found_match=None, extra=None):
    """Fetch a URL and submit it to Bookie as a bookmark.

    ``found_match`` is the URL to bookmark and falls back to ``trigger``.
    ``extra`` is free text that came with the URL: its ``#hashtags`` become
    the bookmark's tags and, if anything besides hashtags is present, the
    whole text is stored as the extended description.

    Returns ``[title, hostname, response_body, response_headers]``.
    """
    url = found_match or trigger
    content = web.get(url)
    # XXX: needs a patch to the URL module
    # find_title() rejects empty content, so skip it for empty downloads
    title = find_title(content=content) if content else None
    if title is None:
        title = '[untitled]'
    data = {u'url': url,
            u'is_private': int(api_private),
            u'description': title.encode('utf-8'),
            u'content': content}
    if extra is not None:
        # extract #tags, uniquely
        # copied from http://stackoverflow.com/a/6331688/1174784
        tags = {tag.strip("#") for tag in extra.split() if tag.startswith("#")}
        if tags:
            # sorted so the submitted tag string is deterministic
            data['tags'] = ' '.join(sorted(tags))
        # strip tags from message and see what's left
        if re.sub(r'#\w+', '', extra).strip():
            # something more than hashtags was provided
            data['extended'] = extra
    return [title, get_hostname(url)] + list(api(bot, trigger, 'bmark', data))
# Download limit used when find_title() has to fetch the page itself. The
# url module's own value is preferred; the fallback is presumably the same
# number url.py uses - TODO confirm.
try:
    from willie.modules.url import max_bytes
except ImportError:
    max_bytes = 655360


def find_title(url=None, content=None):
    """Return the title for the given URL or for already fetched content.

    Copy of find_title that allows for avoiding duplicate requests.

    Exactly one of ``url`` and ``content`` must be given, otherwise
    ValueError is raised. Returns None when the page cannot be decoded, is
    empty, has no ``<title>`` element, or the title is blank.
    """
    if (not content and not url) or (content and url):
        raise ValueError('url *or* content needs to be provided to find_title')
    if url:
        try:
            content, _headers = web.get(url, return_headers=True, limit_bytes=max_bytes)
        except UnicodeDecodeError:
            return None  # Fail silently when data can't be decoded
    if not content:
        # nothing was downloaded, so there is no title to find
        return None
    # Some cleanup that I don't really grok, but was in the original, so
    # we'll keep it (with the compiled regexes made global) for now.
    content = title_tag_data.sub(r'<\1title>', content)
    content = quoted_title.sub('', content)

    start = content.find('<title>')
    end = content.find('</title>')
    if start == -1 or end == -1:
        return None
    # 7 == len('<title>')
    title = web.decode(content[start + 7:end])
    title = title.strip()[:200]

    title = ' '.join(title.split())  # cleanly remove multiple spaces

    # More cryptic regex substitutions. This one looks to be myano's invention.
    title = re_dcc.sub('', title)

    return title or None
# When the file is executed directly, hand it to willie's example-test
# runner.
if __name__ == "__main__":
    from willie.test_tools import run_example_tests
    run_example_tests(__file__)