# scrape_backpage.py (forked from EricSchles/backpageGrab)
import requests
import lxml.html
import pickle
import grequests
from collections import defaultdict, namedtuple
# Page = namedtuple("Page", ['url', 'location', 'ads'])
Ad = namedtuple("Ad", ['url', 'textbody', 'picture_urls'])

def get_all_backpages():
    """Scrape the backpage.com front page for links to the city-level subdomains
    and pickle the list to a file named "backpages"."""
    r = requests.get("http://www.backpage.com/")
    html = lxml.html.fromstring(r.text)
    backpages = html.xpath("//a/@href")
    links = []
    for i in backpages:
        # keep only city subdomains; skip links back to the main www site
        if "backpage" in i and "www" not in i:
            links.append(str(i))
    with open("backpages", "wb") as f:  # pickle requires a binary-mode file
        pickle.dump(links, f)

def get_page_dict(index=2, as_list=False):
    """Open the list of backpages stored in the "backpages" file and generate links to
    the relevant subdomains. Returns either a dict mapping each subdomain category to
    its generated links, or a flat list of all the links generated."""
    with open("backpages", "rb") as f:
        backpages = pickle.load(f)
    link_dict = defaultdict(list)
    subdoms = ["FemaleEscorts/", "BodyRubs/", "Strippers/", "Domination/",
               "TranssexualEscorts/", "MaleEscorts/", "Datelines/", "AdultJobs/"]
    # page 1 is the bare category page; later pages get a "?page=N" suffix
    page_suffix = lambda subdom, page: "{0}?page={1}".format(subdom, page) if page > 1 else subdom
    for page in backpages:
        for i in range(1, index):
            for subdom in subdoms:
                new_link = page + page_suffix(subdom, i)
                link_dict[subdom].append(new_link)
    if as_list:
        # set comprehension: flatten every list of links in link_dict and deduplicate
        return list({link for list_of_links in link_dict.values() for link in list_of_links})
    return link_dict

def get_page_list(index=2):
    """Generate a list of unique, relevant second-level backpage urls,
    e.g. 'http://centraljersey.backpage.com/FemaleEscorts/'."""
    return get_page_dict(index=index, as_list=True)

def get_ad_links_from_page(page):
    """Get all the ad links on a given subpage."""
    r = requests.get(page)
    html = lxml.html.fromstring(r.text)
    ads = html.xpath('//div[@class="cat"]/a/@href')
    return [str(ad) for ad in ads]

def get_page_to_ad_mapping(page_list, as_list=False):
    """Given a list of pages, return a dict mapping each page to its ad links,
    or, with as_list=True, a single flat list of all the ad links."""
    if not as_list:
        return {page: get_ad_links_from_page(page) for page in page_list}
    else:
        return [link for page in page_list for link in get_ad_links_from_page(page)]

def get_ad_links_from_pages(page_list):
    """Flatten the ad links from every page in page_list into one list."""
    return get_page_to_ad_mapping(page_list, as_list=True)

def extract_ad_info_from_response(response):
    """Given a response to a request for an ad, extract the url, posting body,
    and links to the pictures."""
    html = lxml.html.fromstring(response.text)
    posting_body = html.xpath('//div[@class="postingBody"]')
    textbody = [i.text_content() for i in posting_body]
    # I don't think we're getting all images. Will need to revisit.
    picture_urls = list(set(html.xpath('//ul[@id="viewAdPhotoLayout"]/li/a/@href')))
    response.close()
    return Ad(url=response.url, textbody=textbody, picture_urls=picture_urls)

def extract_ad_info_from_url(url):
    """Wrapper around extract_ad_info_from_response for a bare url."""
    return extract_ad_info_from_response(requests.get(url))

def extract_info_from_ads(url_list, asynchronous=False):
    """Fetch each ad url in url_list and extract its info, either one request at a
    time or concurrently via grequests."""
    if asynchronous:
        rs = (grequests.get(u, stream=False) for u in url_list)
        responses = grequests.map(rs)
        # grequests.map returns None for requests that failed, so skip those
        return [extract_ad_info_from_response(response) for response in responses
                if response is not None]
    else:
        return [extract_ad_info_from_response(requests.get(url)) for url in url_list]

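# A small batching helper, added here as a sketch rather than part of the original
# script: the commented-out run_scraper below chunks its requests because grequests
# can't handle that many at once, and the batch size of 10 is borrowed from that code.
def extract_info_from_ads_in_batches(url_list, batch_size=10):
    """Run extract_info_from_ads asynchronously over fixed-size chunks of url_list."""
    ads = []
    for start in range(0, len(url_list), batch_size):
        batch = url_list[start:start + batch_size]
        ads.extend(extract_info_from_ads(batch, asynchronous=True))
    return ads
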
# An earlier, commented-out draft of the end-to-end scraper. Note that it calls
# functions (get_page_link_list, grab_ads, get_information_from_page) that no longer
# exist in this file.
# def run_scraper(testing=False):
#     pages = get_page_link_list()
#     links = []
#     if testing:
#         page = pages[0]
#         # print page
#         links.append(grab_ads(page))
#         information = get_information_from_page(links[0][0])
#         return information
#     else:
#         for page in pages[:10]:
#             links += grab_ads(page)
#         # print "grabbing page data..."
#         # chunking requests because grequests can't handle that many at once
#         url_list = []
#         for i in xrange(0, len(links), 10):
#             url_list.append(links[i-10:i])
#         data = get_information_from_page(url_list, asynchronous=True)
#         print data
#         data = []
#         for link in links:
#             data.append(get_information_from_page(link))
#         return data
# # data = run_scraper()
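
# A rough end-to-end sketch of how the functions above chain together. This block is
# an illustrative addition rather than the original run path; scraping only the first
# ten category pages is an arbitrary limit chosen to keep a test run small.
if __name__ == "__main__":
    get_all_backpages()             # cache the city-level backpage links to "backpages"
    pages = get_page_list(index=2)  # e.g. "http://centraljersey.backpage.com/FemaleEscorts/"
    ad_links = get_ad_links_from_pages(pages[:10])
    ads = extract_info_from_ads_in_batches(ad_links)
    print("scraped {0} ads".format(len(ads)))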