forked from mtrovo/wg-gesucht-scraper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
37 lines (31 loc) · 930 Bytes
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import sys
from collections import namedtuple

import requests
from pyquery import PyQuery as pq
class Offer(object):
    """Empty placeholder type for a listing; it defines no attributes or methods."""
# Load the ad id stored by a previous run, if there is one.
# A missing or unreadable state file just leaves the marker empty (the normal
# situation on a first run); only I/O errors are tolerated, so interrupts and
# programming errors still surface.
lastid = ''
try:
    with open('lastid.tmp', 'r') as f:
        lastid = f.read().strip()
except (IOError, OSError):
    pass
# Download the filtered Berlin listing page and parse its HTML.
# The timeout keeps an unattended run from blocking forever on a stalled
# connection; requests waits indefinitely when none is given.
resp = requests.get(
    'http://www.wg-gesucht.de/en/wohnungen-in-Berlin.8.2.0.0.html?filter=97d0cd356b0f749d3524d653301d91be410257280986c8c4ba',
    timeout=30,
)
d = pq(resp.text)

# Rows of the compact list table, without teaser rows; the first two remaining
# rows are skipped (presumably header rows -- TODO confirm against the page).
trs = d('#table-compact-list tr').not_('.inlistTeaser')[2:]

# The 'adid' attribute of each row, in page order, parallel to `trs`.
ids = [tr.attrib['adid'] for tr in trs]
# Find where the previously stored ad sits in the current page.  Everything
# before it is treated as new; if it is not on the page at all (or no id was
# stored), every row counts as new.
try:
    lastpos = ids.index(lastid)
except ValueError:
    lastpos = len(ids)

# Nothing new (or an empty page): stop with a non-zero status before any
# output is produced or the stored id is rewritten.
if lastpos == 0:
    sys.exit(1)

# Keep only the rows that precede the previously stored ad.
trs = trs[:lastpos]
ids = ids[:lastpos]
def parse_tr(tr):
    """Turn one listing table row into a list of text fields.

    Args:
        tr: Row element exposing ``findall('td')`` (cells providing
            ``text_content()``) and an ``attrib`` mapping with an
            ``'adid'`` entry.

    Returns:
        The stripped text of every cell after the first two, followed by
        the listing URL built from the row's ``adid`` attribute.

    Raises:
        KeyError: If the row has no ``adid`` attribute.
    """
    # Read the cells from the row that was passed in, never from a
    # module-level name, so the result does not depend on what the caller
    # happens to call its loop variable.
    tds = [td.text_content().strip() for td in tr.findall('td')][2:]
    tds.append('http://wg-gesucht.de/' + tr.attrib['adid'])
    return tds
# One list of text fields per new row.
data = [parse_tr(e) for e in trs]

# Emit one tab-separated line per listing as UTF-8 bytes.  The bytes go to
# the underlying byte stream (sys.stdout.buffer on Python 3, sys.stdout itself
# on Python 2) so the output is the same on both interpreters.
report = '\n'.join('\t'.join(row) for row in data) + '\n'
stdout_bytes = getattr(sys.stdout, 'buffer', sys.stdout)
stdout_bytes.write(report.encode('utf-8'))

# Store the first id on the page as the marker for the next run; the context
# manager guarantees the file is flushed and closed.
with open('lastid.tmp', 'w') as f:
    f.write(ids[0])