from datetime import datetime
from loguru import logger
import configparser
import argparse
import dataset
import praw
import sys
import re
# ArgumentParser Setup
parser = argparse.ArgumentParser()
# Todo – Clean up these arguments with subparsers
parser.add_argument('-c', '--config', default='config.ini', help='Config file to be loaded (default = "config.ini")')
parser.add_argument('-v', '--verbose', help='Set the console logger level to DEBUG', action='store_true')
parser.add_argument('-e', '--export', help='rchive by default only preserves self posts and comments. In order to archive full URLs and media, you can export the URLs from your rchive database and send them to something like ArchiveBox, Shaarli, etc.', action='store_true')
parser.add_argument('-f', '--format', help='Format to export URLs to (json, text) (Used in conjunction with --export)')
parser.add_argument('--skip-archive', help='Skip archive - Use with --export to export database without archiving new saved/upvoted (won\'t connect to PRAW)', action='store_true')
parser.add_argument('--include-comment-urls', help='Also include URLs which have been regex\'d out of comments (Optionally used in conjunction with --export)', action='store_true')
parser.add_argument('--include-selftext-urls', help='Also include URLs which have been regex\'d out of selftext posts (Optionally used in conjunction with --export)', action='store_true')
parser.add_argument('--use-new-reddit', help='Use new reddit to generate permalinks', action='store_true')
args = parser.parse_args()
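# Illustrative invocations (config path and flag combinations below are examples only):
#   python rchive.py -c config.ini
#   python rchive.py --export -f text --include-comment-urls
#   python rchive.py --skip-archive --export -f text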
# Logging Setup
logger.remove()
if args.verbose:
logger.add(sys.stderr, level="DEBUG")
else:
logger.add(sys.stderr, level="INFO")
logger.add("rchive.log", level="DEBUG", rotation="5 MB") # File is always set to debug, 5MB rotation because he's a big boi
class rchive:
def __init__(self, config):
logger.debug('Using config file {}'.format(config))
cp = configparser.ConfigParser()
cp.read(config)
self.reddit = self.login(cp)
self.db = self.get_database(cp)
def archive_all(self):
"""Runs a full export of saved and upvoted submissions"""
self.archive_saved(rate_limit=None)
self.archive_upvoted(rate_limit=None)
# Todo - archive functions for submissions/comments, probably from PushShift so that timesearch can take care of the rest
def login(self, config):
"""Auth with Reddit via PRAW"""
client_id = config.get("rchive", "client_id")
client_secret = config.get("rchive", "client_secret")
username = config.get("rchive", "username")
password = config.get("rchive", "password")
user_agent = config.get("rchive", "user_agent")
try:
reddit = praw.Reddit(client_id=client_id,
client_secret=client_secret,
user_agent=user_agent,
username=username,
password=password)
logger.success('Logged in as: {}'.format(reddit.user.me()))
        except Exception as e:
            logger.exception(e)
            # Without a working Reddit session there is nothing useful to return
            sys.exit(1)
        return reddit
def get_database(self, config):
"""Returns a dataset db object for use with rchive"""
        # First check to see if there's a db url in the config
if config.has_option('rchive', 'database'):
url = config.get('rchive', 'database')
else:
            # Build the database URL for a SQLite database using the Reddit username
username = str(self.reddit.user.me())
url = 'sqlite:///{}.db'.format(username)
logger.debug('Attempting to use db url {}'.format(url))
return dataset.connect(url)
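    # Note: dataset connects through SQLAlchemy-style URLs, so the `database`
    # option can point at other backends too; illustrative examples:
    #   sqlite:///rchive.db
    #   postgresql://user:password@localhost/rchive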
def process_submissions(self, submissions, origin):
"""Processes PRAW:Submissions (comments and posts) into the database"""
logger.info('Processing {} items...'.format(origin))
posts = self.db['posts']
comments = self.db['comments']
origin_log = self.db[origin]
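        # Each origin ('saved'/'upvoted') gets its own table recording which
        # idints have been seen from that listing, so new_count below only
        # counts first-time entries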
        count = 0 # Count number of submissions
new_count = 0 # Count number of new db entries
for submission in submissions:
            count += 1
logger.debug('Processing submission: {}'.format(submission.id))
# Handle comments
if isinstance(submission, praw.models.reddit.comment.Comment):
if not comments.find_one(idint=Utils.b36(submission.id)):
logger.debug('\t Inserting comment into database')
self.insert_comment(submission)
else:
logger.debug('\t Skipping comment')
# Handle posts
elif isinstance(submission, praw.models.reddit.submission.Submission):
if not posts.find_one(idint=Utils.b36(submission.id)):
logger.debug('\t Inserting post into database')
self.insert_post(submission)
else:
logger.debug('\t Skipping post')
if not origin_log.find_one(idint=Utils.b36(submission.id)):
origin_log.insert(dict(idint=Utils.b36(submission.id)))
                new_count += 1
logger.info('Processed {} {} items ({} new)'.format(str(count), origin, str(new_count)))
def insert_comment(self, comment):
"""Inserts comment into the database"""
if comment.author is None:
author = '[DELETED]'
else:
author = comment.author.name
# Follows timesearch's database format
# See: https://github.com/voussoir/timesearch/blob/master/timesearch/tsdb.py#L413
comment_data = {
'idint': Utils.b36(comment.id),
'idstr': comment.fullname,
'created': comment.created_utc,
'author': author,
'parent': comment.parent_id,
'submission': comment.link_id,
'body': comment.body,
'score': comment.score,
'subreddit': comment.subreddit.display_name,
'distinguish': comment.distinguished,
'textlen': len(comment.body)
}
self.db['comments'].insert(comment_data)
def insert_post(self, post):
"""Insert post into the database"""
if post.author is None:
author = '[DELETED]'
else:
author = post.author.name
if post.is_self:
url = None
else:
url = post.url
# Follows timesearch's database format
# See: https://github.com/voussoir/timesearch/blob/master/timesearch/tsdb.py#L351
post_data = {
'idint': Utils.b36(post.id),
'idstr': post.fullname,
'created': post.created_utc,
'self': post.is_self,
'nsfw': post.over_18,
'author': author,
'title': post.title,
'url': url,
'selftext': post.selftext,
'score': post.score,
'subreddit': post.subreddit.display_name,
'distinguish': post.distinguished,
'textlen': len(post.selftext),
'num_comments': post.num_comments,
'flair_text': post.link_flair_text,
'flair_css_class': post.link_flair_css_class
}
self.db['posts'].insert(post_data)
def archive_saved(self, rate_limit=1000):
logger.debug('Grabbing {} saved posts'.format(rate_limit if rate_limit else 'max'))
saved_posts = self.reddit.user.me().saved(limit=rate_limit)
self.process_submissions(saved_posts, 'saved')
def archive_upvoted(self, rate_limit=1000):
logger.debug('Grabbing {} upvoted posts'.format(rate_limit if rate_limit else 'max'))
upvoted_posts = self.reddit.user.me().upvoted(limit=rate_limit)
self.process_submissions(upvoted_posts, 'upvoted')
class Export:
def __init__(self, config):
logger.debug('Using config file {}'.format(config))
cp = configparser.ConfigParser()
cp.read(config)
self.db = self.get_database(cp)
    def get_database(self, config):
        """Returns a dataset db object for use with Export"""
        # Export has no Reddit session to derive a filename from, so the
        # database url must be set explicitly in the config
        if not config.has_option('rchive', 'database'):
            logger.error('No database url set in config, nothing to export')
            sys.exit(1)
        url = config.get('rchive', 'database')
        logger.debug('Attempting to use db url {}'.format(url))
        return dataset.connect(url)
def export_to_format(self, file_format):
logger.debug('Attempting export to format {}'.format(file_format))
# Export URLs to text file
if file_format == 'txt' or file_format == 'text':
self.export_to_text()
# Export all available submission info to JSON
elif file_format == 'json':
logger.error('JSON export not yet supported')
else:
logger.error('Unknown export format {}'.format(file_format))
return
    def export_to_text(self):
        logger.info('Exporting submission URLs to text file...')
        filename = 'export_{}.txt'.format(datetime.now().strftime("%m-%d-%Y_%I-%M-%S_%p"))
        with open(filename, 'a') as f:
            # Export posts
            for post in self.db['posts'].all():
                # Handle link posts
                if post['url']:
                    f.write(post['url'] + '\n')
                # Handle selftext posts
                else:
                    f.write(Utils.build_permalink(post, 'post') + '\n')
                if args.include_selftext_urls:
                    # Also export links extracted from selftext
                    for url in Utils.regex_urls(post['selftext']):
                        f.write(url + '\n')
            # Export comments
            for comment in self.db['comments'].all():
                f.write(Utils.build_permalink(comment, 'comment') + '\n')
                if args.include_comment_urls:
                    # Also export links extracted from comment body
                    for url in Utils.regex_urls(comment['body']):
                        f.write(url + '\n')
        # Get number of lines written to text file
        with open(filename) as f:
            line_count = sum(1 for _ in f)
        logger.info('Exported {} items to {}'.format(line_count, filename))
class Utils:
@staticmethod
def build_permalink(submission, submission_type):
"""Takes a comment/post dictionary from dataset and returns a permalink to the submission"""
idstr = submission['idstr'].split('_')[-1] # Remove t#_ prefix from submission idstr
# Handle comments
if submission_type == 'comment':
parent_post_idstr = submission['submission'].split('_')[-1] # Remove prefix from submission idstr
permalink = 'https://old.reddit.com/r/{}/comments/{}//{}/'.format(submission['subreddit'], parent_post_idstr, idstr) # Fuck new reddit
# Handle posts
elif submission_type == 'post':
permalink = 'https://old.reddit.com/r/{}/comments/{}/'.format(submission['subreddit'], idstr) # Fuck new reddit
if args.use_new_reddit:
permalink = permalink.replace('https://old.', 'https://')
return permalink
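    # Illustrative output (ids below are placeholders): a post with idstr
    # 't3_abc123' in r/learnpython becomes
    #   https://old.reddit.com/r/learnpython/comments/abc123/
    # and a comment 't1_def456' on that post becomes
    #   https://old.reddit.com/r/learnpython/comments/abc123//def456/
    # With --use-new-reddit the 'old.' prefix is dropped.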
@staticmethod
def regex_urls(string):
        urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', string)
# Todo - Regular expressions make my brain hurt so rather than killing more of my already very few remaining brain cells, waste some memory and time and clean up any trailing symbols from all of the urls
cleaned_urls = []
for url in urls:
if ')' in url:
cleaned_urls.append(url.split(')', 1)[0])
else:
cleaned_urls.append(url)
return cleaned_urls
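    # For example (illustrative input):
    #   Utils.regex_urls('see https://example.com/page) and http://foo.bar')
    #   -> ['https://example.com/page', 'http://foo.bar']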
# B36 conversion functions thanks to voussoir:
# https://github.com/voussoir/timesearch/blob/master/timesearch/common.py#L35-L57
@staticmethod
def b36(i):
if isinstance(i, int):
return Utils.base36encode(i)
return Utils.base36decode(i)
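    # e.g. Utils.b36(1234) -> 'ya' and Utils.b36('ya') -> 1234, so reddit's
    # base36 ids can be stored and looked up as integers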
@staticmethod
def base36decode(number):
return int(number, 36)
@staticmethod
def base36encode(number, alphabet='0123456789abcdefghijklmnopqrstuvwxyz'):
"""Converts an integer to a base36 string."""
        if not isinstance(number, int):
raise TypeError('number must be an integer')
base36 = ''
sign = ''
if number < 0:
sign = '-'
number = -number
if 0 <= number < len(alphabet):
return sign + alphabet[number]
while number != 0:
number, i = divmod(number, len(alphabet))
base36 = alphabet[i] + base36
return sign + base36
if __name__ == '__main__':
    # Ironically I don't believe rchive should archive by default; in the future I'd like to handle different functions the way timesearch does
# i.e. `python rchive.py archive OPTIONS`,
# `python rchive.py export OPTIONS`,
# `python rchive.py` <-- Would simply print --help
# etc.
if args.skip_archive:
logger.warning('--skip-archive passed, not archiving any saved/upvoted posts, not authing with PRAW')
else:
r = rchive(config=args.config)
r.archive_all()
    if args.export:
        logger.info('--export passed, attempting to create export')
        if not args.format:
            logger.error('--export requires --format (json, text)')
            sys.exit(1)
        e = Export(config=args.config)
        e.export_to_format(args.format.lower())