-
Notifications
You must be signed in to change notification settings - Fork 8
/
homescrape.in
134 lines (111 loc) · 3.48 KB
/
homescrape.in
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/env ruby
# vim: set sw=2 sts=2 tw=100 et nowrap fenc=utf-8 :
# Copyright 2010 Ali Polatel <[email protected]>
# Distributed under the terms of the GNU General Public License v2
%w{getoptlong net/http time uri rubygems nokogiri}.each {|m| require m }
# Chronic is an optional dependency: remember whether it could be loaded so
# that --since values can be parsed with it instead of Time.parse.
has_chronic =
  begin
    require 'chronic'
    true
  rescue LoadError
    false
  end

# Program name: the script's file name without a ".rb" suffix.
MYNAME = File.basename($0, ".rb")
# Version string; the @...@ markers look like build-time placeholders
# (this file is a ".in" template) -- keep them verbatim.
MYVERSION = "@VERSION@" + "@GITHEAD@"
# Raised by Scraper#fetch when the downloaded page contains the text
# "User not found".
class UserNotFound < StandardError
end
# Scrapes the played-tracks listing of a last.fm user, one HTML page at a time.
#
# Table cells are addressed by their position among a row's child nodes (see
# #parse_row), so the class is tied to the markup the site served when it was
# written.
class Scraper
  LASTFM_URL = 'http://www.last.fm/user/%s/tracks'
  LASTFM_DATE_FORMAT = '%Y-%m-%dT%H:%M:%SZ'
  # Track rows: every <tr> that has a subject, a loved and a date cell.
  TRACK_ROW_XPATH = '//tr[td[@class="subjectCell"] and td[@class="lovedCell"] ' \
                    'and td[@class="dateCell last"]]'

  attr_accessor :username, :url

  # username:: last.fm user whose track listing is scraped.
  #
  # The proxy is taken from the +http_proxy+ environment variable when set.
  def initialize(username)
    @username = username
    @url = sprintf(LASTFM_URL, username)
    @proxy_host = @proxy_port = @proxy_user = @proxy_pass = nil

    # Set up proxy
    proxy_env = ENV['http_proxy']
    return if proxy_env.nil? || proxy_env.empty?

    proxy = URI.parse(proxy_env)
    @proxy_host = proxy.host
    @proxy_port = proxy.port
    # Split on the first colon only, so a password may itself contain ':'.
    @proxy_user, @proxy_pass = proxy.userinfo.split(':', 2) if proxy.userinfo
  end

  # Calls the block with (artist, title, love) for every listed track,
  # starting at +page+ and walking forward through the following pages.
  #
  # since:: Date; iteration stops at the first track dated before it.
  # page::  page number to start from (default 1).
  #
  # Returns an Enumerator when no block is given, +nil+ otherwise.
  # Raises UserNotFound when the site reports an unknown user.
  def fetch(since, page = 1, &block)
    return enum_for(:fetch, since, page) unless block

    first_request = true
    loop do
      doc = Nokogiri::HTML(download(page))
      if first_request
        # The page count is read once, from the first page requested.
        @lastpage = last_page(doc)
        first_request = false
      end

      doc.xpath(TRACK_ROW_XPATH).each do |row|
        artist, title, love, date = parse_row(row)
        return if since > date
        block.call artist, title, love
      end

      # Stop ON the last page; do not request a page beyond it.
      return if page >= @lastpage
      page += 1
    end
  end

  private

  # Downloads one listing page and returns its body.
  def download(page)
    uri = URI.parse("#{@url}?page=#{page}")
    req = Net::HTTP::Get.new(uri.request_uri)
    http_class = Net::HTTP::Proxy(@proxy_host, @proxy_port, @proxy_user, @proxy_pass)
    res = http_class.start(uri.host, uri.port) { |http| http.request(req) }
    body = res.body
    raise UserNotFound if body =~ /User not found/
    body
  end

  # Number of the last page according to the pager link; 1 when there is none.
  def last_page(doc)
    link = doc.css('a.lastpage').first
    link ? link.content.to_i : 1
  end

  # Extracts [artist, title, love, date] from one track row.
  def parse_row(row)
    # Fixed child positions -- presumably the nodes in between are
    # whitespace text nodes and other cells; verify against the live markup.
    subject_cell = row.children[2]
    loved_cell = row.children[4]
    date_cell = row.children[8]

    artist = subject_cell.children[1].content
    title = subject_cell.children[3].content
    love = loved_cell.children[1] ? true : false
    # The leading '.' keeps the lookup inside this cell; a bare '//abbr'
    # would match the first <abbr> of the whole document for every row.
    stamp = date_cell.at('.//abbr/@title').to_s
    [artist, title, love, Date.strptime(stamp, LASTFM_DATE_FORMAT)]
  end
end
# Writes the help text to +out+ and terminates the process.
#
# out::  object responding to +puts+ that receives the text
#        (callers pass $stdout for --help and $stderr for usage errors)
# code:: exit status handed to +exit+
def usage(out, code)
  help_text = <<HELP
#{MYNAME} -- import last.fm data
Usage: #{MYNAME} [OPTIONS] USERNAME
Options:
--help, -h Display help and exit
--version, -V Display version and exit
--since, -s Import data since the given date
HELP
  out.puts(help_text)
  exit(code)
end
# Escapes +src+ for the way this script embeds it: inside a '...' literal that
# itself sits inside a double-quoted shell argument.
#
# * A single quote is doubled (SQL-style literal quoting -- presumably what
#   the eugene expression syntax expects).
# * Every character that stays special inside shell double quotes
#   (backslash, double quote, dollar sign, backtick) is backslash-escaped, so
#   scraped text such as "$(cmd)" or "`cmd`" cannot run commands or end the
#   argument early.
#
# src:: String to escape.
# Returns a new String; +src+ is not modified.
def quote(src)
  # Block form: the replacement is built literally, with no gsub
  # backslash-sequence interpretation.
  src.gsub("'", "''").gsub(/[\\"\$`]/) { |char| "\\#{char}" }
end
# ---- Command line handling ------------------------------------------------
opts = GetoptLong.new(
  [ '--help',    '-h', GetoptLong::NO_ARGUMENT ],
  [ '--version', '-V', GetoptLong::NO_ARGUMENT ],
  [ '--since',   '-s', GetoptLong::REQUIRED_ARGUMENT ])

# Default cutoff: the date of the Unix epoch, i.e. import everything.
$since = Date.parse(Time.at(0).to_s)

begin
  opts.each do |opt, arg|
    case opt
    when '--help'
      usage($stdout, 0)
    when '--version'
      puts "#{MYNAME}-#{MYVERSION}"
      exit 0
    when '--since'
      begin
        moment = has_chronic ? Chronic.parse(arg) : Time.parse(arg)
        # Chronic answers nil for text it cannot interpret; Time.parse raises.
        raise ArgumentError, 'unrecognized date' if moment.nil?
        $since = Date.parse(moment.to_s)
      rescue ArgumentError
        $stderr.puts "#{MYNAME}: invalid date for --since: #{arg}"
        exit 1
      end
    end
  end
rescue GetoptLong::Error
  # Unknown option or missing argument: show the usage text instead of a
  # backtrace.
  usage($stderr, 1)
end

usage($stderr, 1) if ARGV.empty?

# ---- Import loop ------------------------------------------------------------

# Doubles single quotes so the text can sit inside a '...' literal of the
# expression handed to eugene.
sql_quote = lambda { |text| text.gsub("'", "''") }

# Echoes an eugene invocation and runs it. The argv form of system bypasses
# the shell, so scraped names containing $, `, \, " or % are passed through
# verbatim and can neither inject commands nor break a format string.
# The last argument is the expression; it is shown in double quotes as one
# would type it.
run_eugene = lambda do |*argv|
  puts "* eugene #{argv[0..-2].join(' ')} \"#{argv.last}\""
  system('eugene', *argv)
end

importer = Scraper.new ARGV[0]
begin
  importer.fetch($since) do |artist, title, love|
    track_expr = "artist='#{sql_quote.call(artist)}' and title='#{sql_quote.call(title)}'"
    run_eugene.call('count', '1', track_expr)
    run_eugene.call('love', track_expr) if love
    run_eugene.call('count', '--artist', '1', "name='#{sql_quote.call(artist)}'")
  end
rescue UserNotFound
  $stderr.puts "#{MYNAME}: user not found: #{ARGV[0]}"
  exit 1
end