-
Notifications
You must be signed in to change notification settings - Fork 37
/
Copy pathURLStripper.py
executable file
·31 lines (31 loc) · 971 Bytes
/
URLStripper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
#!/usr/bin/python
import urlparse
import re
import sys
#from BeautifulSoup import BeautifulSoup as soup
from bs4 import BeautifulSoup as soup
class URLStripper:
    """Pull the target URLs out of a Google search results page.

    The parser looks for the results container ``<div id="ires">``, walks the
    ``<li class="g">`` items of its ``<ol>``, and reads the ``href`` of the
    link inside each item's ``<h3>``. Google wraps that link as
    ``/url?q=<target>&sa=...``; the wrapper is removed so only ``<target>``
    is returned.
    """

    # Redirect prefix Google puts in front of the real target URL.
    _REDIRECT_PREFIX = "/url?q="
    # First tracking parameter appended after the target URL.
    _TRACKING_MARKER = "&sa="
    # Only absolute web links are reported; anything else is skipped.
    _SCHEMES = ("http://", "https://")

    def __init__(self):
        # Last page handed to strip(); None until strip() has been called.
        self.page = None

    def strip(self, page):
        """Return the list of result URLs found in ``page``.

        ``page`` is the HTML text of a results page. It is also stored on
        ``self.page``. An empty list is returned whenever nothing usable is
        found (empty page, missing results container, no result items), so
        callers can always iterate over the return value.
        """
        self.page = page
        if not page:
            # Nothing to parse; skip the parser entirely.
            return []

        results_wrapper = soup(page).find("div", {"id": "ires"})
        if results_wrapper is None or results_wrapper.ol is None:
            return []

        results = []
        for item in results_wrapper.ol.find_all("li", {"class": "g"}):
            # Some result items have no <h3><a href=...>; skip those rather
            # than failing on a missing tag.
            heading = item.h3
            link = heading.a if heading is not None else None
            href = link.get("href") if link is not None else None
            if not href:
                continue
            url = self._clean_href(href)
            if url is not None:
                results.append(url)
        return results

    @staticmethod
    def _clean_href(href):
        """Strip Google's redirect wrapper; return None for non-web links."""
        if href.startswith(URLStripper._REDIRECT_PREFIX):
            href = href[len(URLStripper._REDIRECT_PREFIX):]
            # Everything from the first tracking parameter on is Google's.
            href = href.split(URLStripper._TRACKING_MARKER)[0]
        # NOTE(review): the target is returned as found in the href, without
        # percent-decoding, as before -- confirm whether callers want it
        # unquoted.
        href = str(href)
        if href.startswith(URLStripper._SCHEMES):
            return href
        return None
if __name__ == "__main__":
    # Command-line use: URLStripper.py <saved-results-page>
    # Reads the saved page and prints each extracted URL on its own line.
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s <results-page.html>\n" % sys.argv[0])
        sys.exit(1)

    # Close the file as soon as its contents have been read.
    with open(sys.argv[1], "r") as page_file:
        page = page_file.read()

    # The constructor takes no arguments; the page goes to strip().
    # "or []" tolerates a strip() that returns None when nothing is found.
    for url in URLStripper().strip(page) or []:
        print(url)