scraper.py
## Import packages
import requests
import re
import os
import lxml.html
import pandas as pd
import numpy as np
from itertools import chain
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import urllib.request as ur
from html.parser import HTMLParser
import time

pd.set_option('display.max_colwidth', 120)
pd.set_option('display.max_rows', 100)
## Setting the base URL and the target URL (today's new listings)
base = "http://www.reed.co.uk"
url = "http://www.reed.co.uk/jobs?datecreatedoffset=Today"
## Defining function all_urls() for grabbing the vacancy links from every results page reed.co.uk lists for today
def all_urls():
    r = requests.get(url).content
    soup = BeautifulSoup(r, "html.parser")
    # get the vacancy urls from the first results page
    yield [urljoin(base, a["href"]) for a in soup.select('div.details h3.title a[href^="/jobs"]')]
    nxt = soup.find("a", title="Go to next page")
    # the link titled "Go to next page" is missing when there are no more pages
    while nxt:
        # wash/repeat until no more pages
        r = requests.get(urljoin(base, nxt["href"])).content
        soup = BeautifulSoup(r, "html.parser")
        yield [urljoin(base, a["href"]) for a in soup.select('div.details h3.title a[href^="/jobs"]')]
        nxt = soup.find("a", title="Go to next page")
## Calling the function and collecting the links to each vacancy
dflink = pd.DataFrame(columns=["links"], data=list(chain.from_iterable(all_urls())))
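## Optional sanity check (illustrative addition, not part of the original flow):
## how many vacancy links did we collect before scraping each one?
print(f"Collected {len(dflink)} vacancy links today")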
## Define function strip_html() to reduce an HTML fragment to its plain text
def strip_html(x):
    return lxml.html.fromstring(x).text_content() if x else ' '
## Initializing an empty dataframe with one column per scraped field
emptydata = pd.DataFrame({
    "job_description": [], "salary_disp": [], "salary_min": [], "salary_max": [],
    "salary_time": [], "job_country": [], "job_region": [], "job_locality": [],
    "job_postcode": [], "job_type": [], "job_type_disp": [], "applications_ten": [],
    "link": []
})
## Filling out the dataframe with a description + the other columns for each vacancy
for index, row in dflink.iterrows():
    url = row['links']
    s = ur.urlopen(url).read()
    soup = BeautifulSoup(s, "html.parser")
    description = strip_html(str(soup.find_all("span", itemprop="description")))
    try:
        get_a_new_job = pd.DataFrame({
            "job_description": [description],
            "salary_disp": [soup.select_one('.salary span').text.strip()],
            "salary_min": [soup.select_one('.salary meta[itemprop="minValue"]')["content"]],
            "salary_max": [soup.select_one('.salary meta[itemprop="maxValue"]')["content"]],
            "salary_time": [soup.select_one('.salary meta[itemprop="unitText"]')["content"]],
            "job_country": [soup.select_one('.location span[id="jobCountry"]')["value"]],
            "job_region": [soup.select_one('.location meta[itemprop="addressRegion"]')["content"]],
            "job_locality": [soup.select_one('.location span[itemprop="addressLocality"]').text],
            "job_postcode": [soup.select_one('.location meta[itemprop="postalCode"]')["content"]],
            "job_type": [soup.select_one('.time span[itemprop="employmentType"]')["content"]],
            "job_type_disp": [soup.select_one('.time span[itemprop="employmentType"]').text],
            "applications_ten": [soup.select_one('.applications').text.strip()],
            "link": [url]
        })
        emptydata = pd.concat([emptydata, get_a_new_job], ignore_index=True)
    except Exception:
        # a listing missing any of the fields above is skipped
        pass
    # be polite to the server: throttle every request, not only the successful ones
    time.sleep(1)
## Rename dataframe
reed_data = emptydata
## How many listings have we scraped?
len(reed_data)
## Inspect the listings as a pandas dataframe
reed_data
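## Optional sketch (not part of the original script): persist the scraped listings to disk;
## the filename 'reed_jobs_today.csv' is an assumed choice for illustration
reed_data.to_csv('reed_jobs_today.csv', index=False)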