scrape_backpage.py~
import requests
import lxml.html
import pickle

def get_all_backpages():
    """Collect links to the regional Backpage sites from the front page and pickle them."""
    r = requests.get("http://www.backpage.com/")
    html = lxml.html.fromstring(r.text)
    backpages = html.xpath("//a/@href")
    links = []
    for i in backpages:
        # keep only the regional subdomain links, skipping the main www site
        if "backpage" in i and "www" not in i:
            links.append(str(i))
    # pickle requires a binary file handle
    with open("backpages", "wb") as f:
        pickle.dump(links, f)

def setup_all():
    """Build the full list of category pages, one per category per regional site."""
    backpages = pickle.load(open("backpages", "rb"))
    female_escorts = []
    body_rubs = []
    strippers = []
    dominatrixes = []
    transsexual_escorts = []
    male_escorts = []
    websites = []
    adult_jobs = []
    for i in backpages:
        female_escorts.append(i + "FemaleEscorts/")
        body_rubs.append(i + "BodyRubs/")
        strippers.append(i + "Strippers/")
        dominatrixes.append(i + "Domination/")
        transsexual_escorts.append(i + "TranssexualEscorts/")
        male_escorts.append(i + "MaleEscorts/")
        websites.append(i + "Datelines/")
        adult_jobs.append(i + "AdultJobs/")
    all_pages = (female_escorts + body_rubs + strippers + dominatrixes +
                 transsexual_escorts + male_escorts + websites + adult_jobs)
    return all_pages

# gets all the ads on a given backpage page
def grab_ads(page):
    r = requests.get(page)
    html = lxml.html.fromstring(r.text)
    ads = html.xpath('//div[@class="cat"]/a/@href')
    return ads
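
# A minimal usage sketch (not part of the original file): chain the three
# functions together to crawl every category page and collect ad links.
# The __main__ guard, the "pages"/"all_ads" names, and the final print are
# illustrative assumptions added here.
if __name__ == "__main__":
    get_all_backpages()                   # scrape the front page and pickle the regional links
    pages = setup_all()                   # expand each regional link into its category pages
    all_ads = []
    for page in pages:
        all_ads.extend(grab_ads(page))    # gather ad URLs from each category page
    print(len(all_ads), "ads found")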