fetch_reddit_bot.py
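"""Reddit subreddit scraper.

Prompts for a subreddit and a listing tag (hot, new, rising, controversial,
top), scrapes post details from old.reddit.com with requests and
BeautifulSoup, and stores them in a local SQLite database
(SubredditDatabase.db).
"""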
import requests
import csv
import time
import sqlite3
from bs4 import BeautifulSoup

# Connect to the database.
def sql_connection():
    contact = sqlite3.connect('SubredditDatabase.db')
    return contact

# Create the database table.
def sql_table(contact):
    cur = contact.cursor()
    cur.execute("CREATE TABLE IF NOT EXISTS posts(SUBREDDIT text, TAG text, "
                "TITLE text, AUTHOR text, TIMESTAMP text, UPVOTES int, "
                "COMMENTS text, URL text)")
    contact.commit()

# Insert the collected data into the posts table.
def sql_insert_table(contact, entities):
    cur = contact.cursor()
    cur.execute('INSERT INTO posts(SUBREDDIT, TAG, TITLE, AUTHOR, '
                'TIMESTAMP, UPVOTES, COMMENTS, URL) '
                'VALUES(?, ?, ?, ?, ?, ?, ?, ?)', entities)
    contact.commit()
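
# Not part of the original script: a small illustrative helper (the name
# `sql_fetch_recent` is assumed) sketching how rows stored by
# sql_insert_table() could be read back from the posts table for a quick check.
def sql_fetch_recent(contact, limit=5):
    cur = contact.cursor()
    # Most recently inserted rows first, using SQLite's implicit rowid.
    cur.execute('SELECT SUBREDDIT, TAG, TITLE, UPVOTES, COMMENTS, URL '
                'FROM posts ORDER BY rowid DESC LIMIT ?', (limit,))
    for row in cur.fetchall():
        print(row)
# Example use (run separately): sql_fetch_recent(sql_connection())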

# Scrape the subreddit to find the required information.
def scraper():
    contact = sql_connection()
    sql_table(contact)
    while True:
        subreddit = input('\n\nEnter the name of the subreddit: r/').lower()
        max_count = int(input('Enter the maximum number of entries to collect: '))
        select = int(input('Select a tag for the search: \n1. hot\n2. new'
                           '\n3. rising\n4. controversial\n5. top\nMake your choice: '))
        if select == 1:
            tag = 'hot'
            tag_url = '/'
        elif select == 2:
            tag = 'new'
            tag_url = '/new/'
        elif select == 3:
            tag = 'rising'
            tag_url = '/rising/'
        elif select == 4:
            tag = 'controversial'
            tag_url = '/controversial/'
        elif select == 5:
            tag = 'top'
            tag_url = '/top/'
        else:
            print('Invalid choice, try again.')
            continue
        url = 'https://old.reddit.com/r/' + subreddit + tag_url
        headers = {'User-Agent': 'Mozilla/5.0'}
        required = requests.get(url, headers=headers)
        if required.status_code == 200:
            soup = BeautifulSoup(required.text, 'html.parser')
            print(f'Collecting reddit information for r/{subreddit}...')
            attributes = {'class': 'thing'}
            counter = 1
            full = 0
            reddit_info = []
            while True:
                for posts in soup.find_all('div', attrs=attributes):
                    try:
                        # To obtain the post title
                        title = posts.find('a', class_='title').text
                        # To get the username of the post author
                        author = posts.find('a', class_='author').text
                        # To obtain the time of the post from the <time> tag's title attribute
                        time_stamp = posts.find('time')['title']
                        # To obtain the number of comments on the post
                        comments = posts.find('a', class_='comments').text.split()[0]
                        if comments in ('comment', 'comments'):
                            comments = 0
                        # To get the upvotes on the post ('•' means the score is hidden)
                        upvotes = posts.find('div', class_='score likes').text
                        if upvotes == '•':
                            upvotes = 'None'
                        link = posts.find('a', class_='title')['href']
                        link = 'www.reddit.com' + link
                        # Entering all the collected information into our database
                        entities = (subreddit, tag, title, author, time_stamp, upvotes,
                                    comments, link)
                        sql_insert_table(contact, entities)
                        if counter == max_count:
                            full = 1
                            break
                        counter += 1
                    except (AttributeError, TypeError):
                        continue
                if full:
                    break
                try:
                    # Follow the 'next' button to the following page of results
                    next_button = soup.find('span', class_='next-button')
                    next_page = next_button.find('a')['href']
                    time.sleep(2)
                    required = requests.get(next_page, headers=headers)
                    soup = BeautifulSoup(required.text, 'html.parser')
                except AttributeError:
                    break
            print('Finished\n')
            answer = input('Press (y) to continue or (n) to exit: ').lower()
            if answer == 'y':
                continue
            elif answer == 'n':
                print('Exiting')
                break
        else:
            print('Error occurred')


if __name__ == '__main__':
    scraper()