From a5c394682a295b3cece35f66a23609e79e44f542 Mon Sep 17 00:00:00 2001 From: James Turnbull Date: Sun, 17 Jan 2016 11:10:10 -0500 Subject: [PATCH 01/11] Added reference to Riemann GitHub account --- README.markdown | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.markdown b/README.markdown index db902127..047c4589 100644 --- a/README.markdown +++ b/README.markdown @@ -9,8 +9,10 @@ testing. I've got a whole bunch of these internally for monitoring Redis, Riak, queues, etc. Most have internal configuration dependencies, so it'll be a while before I can extract them for re-use. +See additional programs in the [Riemann GitHub account](https://github.com/riemann/). + Get started -========== +=========== ``` bash gem install riemann-tools From 0854910c0a305b5159c7d28f463288ffe37832e2 Mon Sep 17 00:00:00 2001 From: James Turnbull Date: Sun, 17 Jan 2016 11:17:33 -0500 Subject: [PATCH 02/11] Split Riemann Docker out to https://github.com/riemann/riemann-docker --- Rakefile.rb | 1 - bin/riemann-docker-health | 204 -------------------------------------- 2 files changed, 205 deletions(-) delete mode 100755 bin/riemann-docker-health diff --git a/Rakefile.rb b/Rakefile.rb index 2917aa48..f8bc8724 100644 --- a/Rakefile.rb +++ b/Rakefile.rb @@ -28,7 +28,6 @@ s.add_dependency 'fog', '>= 1.4.0' s.add_dependency 'faraday', '>= 0.8.5' s.add_dependency 'nokogiri', '>= 1.5.6' - s.add_dependency 'docker-api', '>= 1.22.0' s.files = FileList['lib/**/*', 'bin/*', 'LICENSE', 'README.markdown'].to_a s.executables |= Dir.entries('bin/') diff --git a/bin/riemann-docker-health b/bin/riemann-docker-health deleted file mode 100755 index 256abd86..00000000 --- a/bin/riemann-docker-health +++ /dev/null @@ -1,204 +0,0 @@ -#!/usr/bin/env ruby - -# Reports current CPU, disk, load average, and memory use to riemann. - -require File.expand_path('../../lib/riemann/tools', __FILE__) -require 'docker' -require 'socket' - -class Riemann::Tools::DockerHealth - include Riemann::Tools - include Docker - - opt :docker_host, "Docker Container Host (see https://github.com/swipely/docker-api#host)", :default => nil - opt :cpu_warning, "CPU warning threshold (fraction of total jiffies)", :default => 0.9 - opt :cpu_critical, "CPU critical threshold (fraction of total jiffies)", :default => 0.95 - opt :disk_warning, "Disk warning threshold (fraction of space used)", :default => 0.9 - opt :disk_critical, "Disk critical threshold (fraction of space used)", :default => 0.95 - opt :memory_warning, "Memory warning threshold (fraction of RAM)", :default => 0.85 - opt :memory_critical, "Memory critical threshold (fraction of RAM)", :default => 0.95 - opt :host_hostname, "Suffix of host", :type => String, :default => nil - opt :checks, "A list of checks to run.", :type => :strings, :default => ['cpu', 'memory', 'disk', 'basic'] - - def get_containers - Docker::Container.all - end - - def get_container_name(container) - container.json['Name'][1..-1] - end - - def initialize - - if (opts[:docker_host] != nil) - Docker.url = opts[:docker_host] - end - - @hostname = opts[:host_hostname] - if (@hostname.nil? || !(@hostname.is_a? String) || @hostname.empty?) - @hostname = Socket.gethostname - end - - @cpu_coefficient = 1000 * 1000 * 1000 - - @limits = { - :cpu => {:critical => opts[:cpu_critical], :warning => opts[:cpu_warning]}, - :disk => {:critical => opts[:disk_critical], :warning => opts[:disk_warning]}, - :memory => {:critical => opts[:memory_critical], :warning => opts[:memory_warning]} - } - - @last_cpu_reads = Hash.new - @last_uptime_reads = Hash.new - - opts[:checks].each do |check| - case check - when 'disk' - @disk_enabled = true - when 'cpu' - @cpu_enabled = true - when 'memory' - @memory_enabled = true - when 'basic' - @basic_inspection_enabled = true - end - end - end - - def alert(container, service, state, metric, description) - - opts = { :service => service.to_s, - :state => state.to_s, - :metric => metric.to_f, - :description => description } - - if (container != nil) - opts[:host] = "#{@hostname}-#{container}" - else - opts[:host] = @hostname - end - - report(opts) - end - - def report_pct(container, service, fraction, report = '', name = nil) - if fraction - - if (name == nil) - name = service - end - - if fraction > @limits[service][:critical] - alert container, name, :critical, fraction, "#{sprintf("%.2f", fraction * 100)}% #{report}" - elsif fraction > @limits[service][:warning] - alert container, name, :warning, fraction, "#{sprintf("%.2f", fraction * 100)}% #{report}" - else - alert container, name, :ok, fraction, "#{sprintf("%.2f", fraction * 100)}% #{report}" - end - end - end - - - def cpu(id, name, stats) - - current = stats['precpu_stats']['cpu_usage']['total_usage'] / stats['precpu_stats']['cpu_usage']['percpu_usage'].count - - unless current - alert name, :cpu, :unknown, nil, 'no total usage found in docker remote api stats' - return false - end - - current_time = Time.parse(stats['read']); - if (@last_cpu_reads[id] != nil) - last = @last_cpu_reads[id] - used = (current - last[:v]) / (current_time - last[:t]) / @cpu_coefficient - - report_pct name, :cpu, used - end - - @last_cpu_reads[id] = { v: current, t: current_time } - end - - def memory(id, name, stats) - memory_stats = stats['memory_stats'] - usage = memory_stats['usage'].to_f - total = memory_stats['limit'].to_f - fraction = (usage / total) - - report_pct name, :memory, fraction, "#{usage} / #{total}" - end - - def disk - `df -P`.split(/\n/).each do |r| - f = r.split(/\s+/) - next if f[0] == 'Filesystem' - next unless f[0] =~ /\// # Needs at least one slash in the mount path - - # Calculate capacity - x = f[4].to_f/100 - report_pct(nil, :disk, x, "#{f[3].to_i / 1024} mb left", "disk #{f[5]}") - end - end - - def basic_inspection(id, name, inspection) - - state = inspection['State'] - json_state = JSON.generate(state) - - running = state['Running'] - - alert(name, "status", - running ? "ok" : "critical", - running ? 1 : 0, - json_state) - - if (running) - start_time = DateTime.rfc3339(state['StartedAt']).to_time.utc.to_i - now = DateTime.now.to_time.utc.to_i - uptime = now - start_time - - if (@last_uptime_reads[id] != nil) - last = @last_uptime_reads[id] - restarted = start_time != last - alert(name, "uptime", - restarted ? "critical" : "ok", - uptime, - "last 'StartedAt' measure was #{last} (#{Time.at(last).utc.to_s}), " + - "now it's #{start_time} (#{Time.at(start_time).utc.to_s})") - end - - @last_uptime_reads[id] = start_time - end - end - - def tick - - # Disk is the same in every container - if @disk_enabled - disk() - end - - # Get CPU, Memory and Load of each container - containers = get_containers() - containers.each do |container| - - id = container.id - name = get_container_name(container) - - stats = Docker::Util.parse_json(container.connection.get("/containers/#{id}/stats", {stream:false})) - - if @basic_inspection_enabled - inspection = Docker::Util.parse_json(container.connection.get("/containers/#{id}/json")) - basic_inspection(id, name, inspection) - end - if @cpu_enabled - cpu(id, name, stats) - end - if @memory_enabled - memory(id, name, stats) - end - end - - end -end - -Riemann::Tools::DockerHealth.run From 931ade92ee6260cacee15cf10c458721a121602b Mon Sep 17 00:00:00 2001 From: James Turnbull Date: Sun, 17 Jan 2016 11:23:01 -0500 Subject: [PATCH 03/11] Split Riemann Elasticsearch out to https://github.com/riemann/riemann-elasticsearch --- bin/riemann-elasticsearch | 91 --------------------------------------- 1 file changed, 91 deletions(-) delete mode 100755 bin/riemann-elasticsearch diff --git a/bin/riemann-elasticsearch b/bin/riemann-elasticsearch deleted file mode 100755 index c7a0c95a..00000000 --- a/bin/riemann-elasticsearch +++ /dev/null @@ -1,91 +0,0 @@ -#!/usr/bin/env ruby - -require File.expand_path('../../lib/riemann/tools', __FILE__) - -class Riemann::Tools::Elasticsearch - include Riemann::Tools - - require 'faraday' - require 'json' - require 'uri' - - opt :read_timeout, 'Faraday read timeout', type: :int, default: 2 - opt :open_timeout, 'Faraday open timeout', type: :int, default: 1 - opt :path_prefix, 'Elasticsearch path prefix for proxied installations e.g. "els" for target http://localhost/els/_cluster/health', default: "/" - opt :es_host, 'Elasticsearch host', default: "localhost" - opt :es_port, 'Elasticsearch port', type: :int, default: 9200 - - - # Handles HTTP connections and GET requests safely - def safe_get(uri) - # Handle connection timeouts - response = nil - begin - connection = Faraday.new(uri) - response = connection.get do |req| - req.options[:timeout] = options[:read_timeout] - req.options[:open_timeout] = options[:open_timeout] - end - rescue => e - report(:host => uri.host, - :service => "elasticsearch health", - :state => "critical", - :description => "HTTP connection error: #{e.class} - #{e.message}" - ) - end - response - end - - def health_url - path_prefix = options[:path_prefix] - path_prefix[0] = '' if path_prefix[0]=='/' - path_prefix[path_prefix.length-1] = '' if path_prefix[path_prefix.length-1]=='/' - "http://#{options[:es_host]}:#{options[:es_port]}#{path_prefix.length>0?'/':''}#{path_prefix}/_cluster/health" - end - - def tick - uri = URI(health_url) - response = safe_get(uri) - - return if response.nil? - - if response.status != 200 - report(:host => uri.host, - :service => "elasticsearch health", - :state => "critical", - :description => "HTTP connection error: #{response.status} - #{response.body}" - ) - else - # Assuming that a 200 will give json - json = JSON.parse(response.body) - cluster_name = json.delete("cluster_name") - cluster_status = json.delete("status") - state = case cluster_status - when "green" - "ok" - when "yellow" - "warning" - when "red" - "critical" - end - - report(:host => uri.host, - :service => "elasticsearch health", - :state => state, - :description => "Elasticsearch cluster: #{cluster_name} - #{cluster_status}") - - json.each_pair do |k,v| - report(:host => uri.host, - :service => "elasticsearch #{k}", - :metric => v, - :description => "Elasticsearch cluster #{k}" - ) - - end - end - end - - - -end -Riemann::Tools::Elasticsearch.run From 76ec19d2854111ad2c9c0d9e10bff628ce09b942 Mon Sep 17 00:00:00 2001 From: James Turnbull Date: Sun, 17 Jan 2016 11:23:31 -0500 Subject: [PATCH 04/11] Split Riemann Resmon out to https://github.com/riemann/riemann-resmon --- bin/riemann-resmon | 103 --------------------------------------------- 1 file changed, 103 deletions(-) delete mode 100755 bin/riemann-resmon diff --git a/bin/riemann-resmon b/bin/riemann-resmon deleted file mode 100755 index 5839d4fe..00000000 --- a/bin/riemann-resmon +++ /dev/null @@ -1,103 +0,0 @@ -#!/usr/bin/env ruby - -require File.expand_path('../../lib/riemann/tools', __FILE__) - -class Riemann::Tools::Resmon - include Riemann::Tools - require 'nokogiri' - require 'faraday' - - opt :resmon_hostfile, 'File with hostnames running Resmon (one URI per line)', type: :string - opt :read_timeout, 'Faraday read timeout', type: :int, default: 2 - opt :open_timeout, 'Faraday open timeout', type: :int, default: 1 - opt :fqdn, 'Use FQDN for event host' - - - def initialize - @hosts = File.read(options[:resmon_hostfile]).split("\n") - super - end - - - # Work out the hostname to submit with the event - def get_event_host(host) - unless options[:fqdn] - return host.split('.')[0] - end - return host - end - - # Handles HTTP connections and GET requests safely - def safe_get(uri, event_host) - # Handle connection timeouts - response = nil - begin - connection = Faraday.new(uri) - response = connection.get do |req| - req.options[:timeout] = options[:read_timeout] - req.options[:open_timeout] = options[:open_timeout] - end - rescue => e - report(:host => event_host, - :service => "resmon", - :state => "critical", - :description => "HTTP connection error: #{e.class} - #{e.message}" - ) - end - response - end - - def tick - @hosts.each do |host| - - uri = URI(host) - event_host = get_event_host(uri.host) - - response = safe_get(uri, event_host) - next if response.nil? - - # Handle non-200 responses - if response.status != 200 - report(:host => event_host, - :service => "resmon", - :state => "critical", - :description => "HTTP connection error: #{response.status} - #{response.body}" - ) - next - else - report(:host => event_host, - :service => "resmon", - :state => "ok", - :description => "Resmon connection ok" - ) - doc = Nokogiri::XML(response.body) - end - - doc.xpath('//ResmonResults/ResmonResult').each do |result| - timestamp = result.xpath('last_update').first.text - result.xpath('metric').each do |metric| - hash = { - host: event_host, - service: "#{result.attributes['module'].value}`#{result.attributes['service'].value}`#{metric.attributes['name'].value}", - time: timestamp.to_i - } - - case metric.attributes['type'].value - when /[iIlL]/ - hash[:metric] = metric.text.to_i - when 'n' - hash[:metric] = metric.text.to_f - when 's' - hash[:description] = metric.text - when '0' - raise 'dunno what 0 is yet' - end - - report(hash) - end - end - end - end -end - -Riemann::Tools::Resmon.run From b93fe900fd41144318c59be5d082bb69cfc2c877 Mon Sep 17 00:00:00 2001 From: James Turnbull Date: Sun, 17 Jan 2016 11:27:12 -0500 Subject: [PATCH 05/11] Split Riemann RabbitMQ out to https://github.com/riemann/riemann-rabbitmq --- bin/riemann-rabbitmq | 267 ------------------------------------------- 1 file changed, 267 deletions(-) delete mode 100755 bin/riemann-rabbitmq diff --git a/bin/riemann-rabbitmq b/bin/riemann-rabbitmq deleted file mode 100755 index 5dc12331..00000000 --- a/bin/riemann-rabbitmq +++ /dev/null @@ -1,267 +0,0 @@ -#!/usr/bin/env ruby - -require File.expand_path('../../lib/riemann/tools', __FILE__) - -class Riemann::Tools::Rabbitmq - include Riemann::Tools - - require 'faraday' - require 'json' - require 'uri' - - - opt :read_timeout, 'Faraday read timeout', type: :int, default: 2 - opt :open_timeout, 'Faraday open timeout', type: :int, default: 1 - - opt :monitor_user, 'RabbitMQ monitoring user', type: :string - opt :monitor_pass, 'RabbitMQ monitoring user password', type: :string - opt :monitor_port, 'RabbitMQ monitoring port', type: :int, default: 15672 - opt :monitor_host, 'RabbitMQ monitoring host', type: :string, default: "localhost" - opt :monitor_use_tls, 'RabbitMQ use tls', type: :bool, default: false - - opt :max_queue_size, "max number of items in a queue that is acceptable", type: :int, default: 1_000_000 - opt :ignore_max_size_queues, "A regular expression to match queues that shouldn't be size-checked", type: :string - - opt :node, "Specify a node to monitor", type: :strings - - def base_url - protocol = "http" - if (options[:monitor_use_tls]) && (options[:monitor_use_tls]==true) - protocol = "https" - end - "#{protocol}://#{options[:monitor_user]}:#{options[:monitor_pass]}@#{options[:monitor_host]}:#{options[:monitor_port]}/api" - end - - def overview_url - "#{base_url}/overview" - end - - def node_url(n) - "#{base_url}/nodes/#{n}" - end - - def queues_url - "#{base_url}/queues" - end - - def event_host - if options[:event_host] - return options[:event_host] - else - return options[:monitor_host] - end - end - - def safe_get(uri, event_host) - # Handle connection timeouts - response = nil - begin - connection = Faraday.new(uri) - response = connection.get do |req| - req.options[:timeout] = options[:read_timeout] - req.options[:open_timeout] = options[:open_timeout] - end - report(:host => event_host, - :service => "rabbitmq monitoring", - :state => 'ok', - :description => "Monitoring operational" - ) - rescue => e - report(:host => event_host, - :service => "rabbitmq monitoring", - :state => "critical", - :description => "HTTP connection error: #{e.class} - #{e.message}" - ) - end - response - end - - def check_queues - response = safe_get(queues_url, event_host) - max_size_check_filter = if options[:ignore_max_size_queues] - Regexp.new(options[:ignore_max_size_queues]) - else - nil - end - - return if response.nil? - - json = JSON.parse(response.body) - - if response.status != 200 - report(:host => event_host, - :service => "rabbitmq.queue", - :state => "critical", - :description => "HTTP connection error to /api/queues: #{response.status} - #{response.body}" - ) - else - report(:host => event_host, - :service => "rabbitmq.queue", - :state => "ok", - :description => "HTTP connection ok" - ) - - json = JSON.parse(response.body) - - json.each do |queue| - svc = "rabbitmq.queue.#{queue['vhost']}.#{queue['name']}" - errs = [] - - if queue['messages_ready']!=nil and queue['messages_ready'] > 0 and queue['consumers'] == 0 - errs << "Queue has jobs but no consumers" - end - - if (max_size_check_filter.nil? or queue['name'] !~ max_size_check_filter) and queue['messages_ready']!=nil and queue['messages_ready'] > options[:max_queue_size] - errs << "Queue has #{queue['messages_ready']} jobs" - end - - if errs.empty? - report(:host => event_host, - :service => svc, - :state => "ok", - :description => "Queue is looking good" - ) - else - report(:host => event_host, - :service => svc, - :state => "critical", - :description => errs.join("; ") - ) - end - - stats = (queue['message_stats'] || {}).merge( - 'messages' => queue['messages'], - 'messages_details' => queue['messages_details'], - 'messages_ready' => queue['messages_ready'], - 'messages_ready_details' => queue['messages_ready_details'], - 'messages_unacknowledged' => queue['messages_unacknowledged'], - 'messages_unacknowledged_details' => queue['messages_unacknowledged_details'], - 'consumers' => queue['consumers'], - 'memory' => queue['memory'], - ) - - stats.each_pair do |k,v| - service = "#{svc}.#{k}" - if k =~ /details$/ and v!=nil - metric = v['rate'] - else - metric = v - end - - # TODO: Set state via thresholds which can be configured - - report(:host => event_host, - :service => service, - :metric => metric, - :description => "RabbitMQ monitor" - ) - end - end - end - end - - def check_overview - uri = URI(overview_url) - response = safe_get(uri, event_host) - - return if response.nil? - - json = JSON.parse(response.body) - - if response.status != 200 - report(:host => event_host, - :service => "rabbitmq", - :state => "critical", - :description => "HTTP connection error: #{response.status} - #{response.body}" - ) - else - report(:host => event_host, - :service => "rabbitmq monitoring", - :state => "ok", - :description => "HTTP connection ok" - ) - - %w( message_stats queue_totals object_totals ).each do |stat| - # NOTE / BUG ? - # Brand new servers can have blank message stats. Is this ok? - # I can't decide. - next if json[stat].empty? - json[stat].each_pair do |k,v| - service = "rabbitmq.#{stat}.#{k}" - if k =~ /details$/ - metric = v['rate'] - else - metric = v - end - - # TODO: Set state via thresholds which can be configured - - report(:host => event_host, - :service => service, - :metric => metric, - :description => "RabbitMQ monitor" - ) - end - end - end - end - - def check_node - opts[:node].each do |n| - uri = URI(node_url(n)) - response = safe_get(uri, event_host) - - return if response.nil? - - if response.status != 200 - if response.status == 404 - report(:host => event_host, - :service => "rabbitmq.node.#{n}", - :state => "critical", - :description => "Node was not found in the cluster" - ) - else - report(:host => event_host, - :service => "rabbitmq.node.#{n}", - :state => "critical", - :description => "HTTP error: #{response.status} - #{response.body}" - ) - end - return - end - - json = JSON.parse(response.body) - - if json['mem_alarm'] - report(:host => event_host, - :service => "rabbitmq.node.#{n}", - :state => "critical", - :description => "Memory alarm has triggered; job submission throttled" - ) - return - end - - if json['disk_free_alarm'] - report(:host => event_host, - :service => "rabbitmq.node.#{n}", - :state => "critical", - :description => "Disk free alarm has triggered; job submission throttled" - ) - return - end - - report(:host => event_host, - :service => "rabbitmq.node.#{n}", - :state => "ok", - :description => "Node looks OK to me" - ) - end - end - - def tick - check_overview - check_node if opts[:node] - check_queues - end -end -Riemann::Tools::Rabbitmq.run From 5b6401306f5660731f1860ef3982595a96c97ff5 Mon Sep 17 00:00:00 2001 From: James Turnbull Date: Sun, 17 Jan 2016 11:29:47 -0500 Subject: [PATCH 06/11] Split Riemann Mesos out to https://github.com/riemann/riemann-mesos --- bin/riemann-mesos | 79 ----------------------------------------------- 1 file changed, 79 deletions(-) delete mode 100755 bin/riemann-mesos diff --git a/bin/riemann-mesos b/bin/riemann-mesos deleted file mode 100755 index 91df6973..00000000 --- a/bin/riemann-mesos +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env ruby - -require File.expand_path('../../lib/riemann/tools', __FILE__) - -class Riemann::Tools::Mesos - include Riemann::Tools - - require 'faraday' - require 'json' - require 'uri' - - opt :read_timeout, 'Faraday read timeout', type: :int, default: 2 - opt :open_timeout, 'Faraday open timeout', type: :int, default: 1 - opt :path_prefix, 'Mesos path prefix for proxied installations e.g. "mesos" for target http://localhost/mesos/metrics/snapshot', default: "/" - opt :mesos_host, 'Mesos host', default: "localhost" - opt :mesos_port, 'Mesos port', type: :int, default: 5050 - - # Handles HTTP connections and GET requests safely - def safe_get(uri) - # Handle connection timeouts - response = nil - begin - connection = Faraday.new(uri) - response = connection.get do |req| - req.options[:timeout] = options[:read_timeout] - req.options[:open_timeout] = options[:open_timeout] - end - rescue => e - report(:host => uri.host, - :service => "mesos health", - :state => "critical", - :description => "HTTP connection error: #{e.class} - #{e.message}" - ) - end - response - end - - def health_url - path_prefix = options[:path_prefix] - path_prefix[0] = '' if path_prefix[0]=='/' - path_prefix[path_prefix.length-1] = '' if path_prefix[path_prefix.length-1]=='/' - "http://#{options[:mesos_host]}:#{options[:mesos_port]}#{path_prefix.length>0?'/':''}#{path_prefix}/metrics/snapshot" - end - - def tick - uri = URI(health_url) - response = safe_get(uri) - - return if response.nil? - - if response.status != 200 - report(:host => uri.host, - :service => "mesos health", - :state => "critical", - :description => "HTTP connection error: #{response.status} - #{response.body}" - ) - else - # Assuming that a 200 will give json - json = JSON.parse(response.body) - state = "ok" - - report(:host => uri.host, - :service => "mesos health", - :state => state) - - json.each_pair do |k,v| - report(:host => uri.host, - :service => "mesos #{k}", - :metric => v - ) - - end - end - end - - - -end -Riemann::Tools::Mesos.run From 98874d9f1381657f3234ec47a16da6fbf2e4dc3e Mon Sep 17 00:00:00 2001 From: James Turnbull Date: Sun, 17 Jan 2016 11:34:01 -0500 Subject: [PATCH 07/11] Split Riemann Marathon out to https://github.com/riemann/riemann-marathon --- Rakefile.rb | 1 - bin/riemann-marathon | 96 -------------------------------------------- 2 files changed, 97 deletions(-) delete mode 100755 bin/riemann-marathon diff --git a/Rakefile.rb b/Rakefile.rb index f8bc8724..da758f84 100644 --- a/Rakefile.rb +++ b/Rakefile.rb @@ -26,7 +26,6 @@ s.add_dependency 'munin-ruby', '>= 0.2.1' s.add_dependency 'yajl-ruby', '>= 1.1.0' s.add_dependency 'fog', '>= 1.4.0' - s.add_dependency 'faraday', '>= 0.8.5' s.add_dependency 'nokogiri', '>= 1.5.6' s.files = FileList['lib/**/*', 'bin/*', 'LICENSE', 'README.markdown'].to_a diff --git a/bin/riemann-marathon b/bin/riemann-marathon deleted file mode 100755 index 389c3ffa..00000000 --- a/bin/riemann-marathon +++ /dev/null @@ -1,96 +0,0 @@ -#!/usr/bin/env ruby - -require File.expand_path('../../lib/riemann/tools', __FILE__) - -class Riemann::Tools::Marathon - include Riemann::Tools - - require 'faraday' - require 'json' - require 'uri' - - opt :read_timeout, 'Faraday read timeout', type: :int, default: 2 - opt :open_timeout, 'Faraday open timeout', type: :int, default: 1 - opt :path_prefix, 'Marathon path prefix for proxied installations e.g. "marathon" for target http://localhost/marathon/metrics', default: "/" - opt :marathon_host, 'Marathon host', default: "localhost" - opt :marathon_port, 'Marathon port', type: :int, default: 8080 - - def initialize - options[:interval] = 60 - options[:ttl] = 120 - end - - # Handles HTTP connections and GET requests safely - def safe_get(uri) - # Handle connection timeouts - response = nil - begin - connection = Faraday.new(uri) - response = connection.get do |req| - req.options[:timeout] = options[:read_timeout] - req.options[:open_timeout] = options[:open_timeout] - end - rescue => e - report(:host => uri.host, - :service => "marathon health", - :state => "critical", - :description => "HTTP connection error: #{e.class} - #{e.message}" - ) - end - response - end - - def health_url - path_prefix = options[:path_prefix] - path_prefix[0] = '' if path_prefix[0]=='/' - path_prefix[path_prefix.length-1] = '' if path_prefix[path_prefix.length-1]=='/' - "http://#{options[:marathon_host]}:#{options[:marathon_port]}#{path_prefix.length>0?'/':''}#{path_prefix}/metrics" - end - - def tick - uri = URI(health_url) - response = safe_get(uri) - - return if response.nil? - - if response.status != 200 - report(:host => uri.host, - :service => "marathon health", - :state => "critical", - :description => "HTTP connection error: #{response.status} - #{response.body}" - ) - else - # Assuming that a 200 will give json - json = JSON.parse(response.body) - state = "ok" - - report(:host => uri.host, - :service => "marathon health", - :state => state) - - json.each_pair do |t, d| - if d.respond_to? :each_pair - d.each_pair do |service, counters| - report(:host => uri.host, - :service => "marathon_metric #{t} #{service}", - :metric => 1, - :tags => ["metric_name"] - ) - if counters.respond_to? :each_pair - counters.each_pair do |k, v| - if v.is_a? Numeric - report(:host => uri.host, - :service => "marathon #{service} #{k}", - :metric => v, - :tags => ["metric", "#{t}"] - ) - end - end - end - end - end - end - end - end -end -Riemann::Tools::Marathon.run From 3359a5e0d1843929adeb3f82b036a0b19a8b85fb Mon Sep 17 00:00:00 2001 From: James Turnbull Date: Sun, 17 Jan 2016 11:39:43 -0500 Subject: [PATCH 08/11] Split Riemann Munin out to https://github.com/riemann/riemann-munin --- Rakefile.rb | 1 - bin/riemann-munin | 36 ------------------------------------ 2 files changed, 37 deletions(-) delete mode 100755 bin/riemann-munin diff --git a/Rakefile.rb b/Rakefile.rb index da758f84..34d815f5 100644 --- a/Rakefile.rb +++ b/Rakefile.rb @@ -23,7 +23,6 @@ s.add_dependency 'riemann-client', '>= 0.2.2' s.add_dependency 'trollop', '>= 1.16.2' - s.add_dependency 'munin-ruby', '>= 0.2.1' s.add_dependency 'yajl-ruby', '>= 1.1.0' s.add_dependency 'fog', '>= 1.4.0' s.add_dependency 'nokogiri', '>= 1.5.6' diff --git a/bin/riemann-munin b/bin/riemann-munin deleted file mode 100755 index bbc8ea27..00000000 --- a/bin/riemann-munin +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env ruby - -# Gathers munin statistics and submits them to Riemann. - -require File.expand_path('../../lib/riemann/tools', __FILE__) - -class Riemann::Tools::Munin - include Riemann::Tools - require 'munin-ruby' - - def initialize - @munin = ::Munin::Node.new - end - - def tick - services = opts[:services] || @munin.list - services.each do |service| - @munin.fetch(service).each do |service, parts| - parts.each do |part, metric| - report( - :service => "#{service} #{part}", - :metric => metric.to_f, - :state => 'ok', - :tags => ['munin'] - ) - end - end - end - end - - opt :munin_host, "Munin hostname", :default => 'localhost' - opt :munin_port, "Munin port", :default => 4949 - opt :services, "Munin services to translate (if not specified, all services are relayed)", :type => :strings -end - -Riemann::Tools::Munin.run From e52b85c1134314b90ba1f839eb886aa9079d0014 Mon Sep 17 00:00:00 2001 From: James Turnbull Date: Sun, 17 Jan 2016 11:51:53 -0500 Subject: [PATCH 09/11] Split Riemann AWS out to https://github.com/riemann/riemann-aws --- Rakefile.rb | 1 - bin/riemann-aws-billing | 79 ------------------ bin/riemann-aws-rds-status | 48 ----------- bin/riemann-aws-sqs-status | 37 --------- bin/riemann-aws-status | 64 --------------- bin/riemann-elb-metrics | 161 ------------------------------------- 6 files changed, 390 deletions(-) delete mode 100755 bin/riemann-aws-billing delete mode 100755 bin/riemann-aws-rds-status delete mode 100755 bin/riemann-aws-sqs-status delete mode 100755 bin/riemann-aws-status delete mode 100755 bin/riemann-elb-metrics diff --git a/Rakefile.rb b/Rakefile.rb index 34d815f5..4c739ee4 100644 --- a/Rakefile.rb +++ b/Rakefile.rb @@ -24,7 +24,6 @@ s.add_dependency 'riemann-client', '>= 0.2.2' s.add_dependency 'trollop', '>= 1.16.2' s.add_dependency 'yajl-ruby', '>= 1.1.0' - s.add_dependency 'fog', '>= 1.4.0' s.add_dependency 'nokogiri', '>= 1.5.6' s.files = FileList['lib/**/*', 'bin/*', 'LICENSE', 'README.markdown'].to_a diff --git a/bin/riemann-aws-billing b/bin/riemann-aws-billing deleted file mode 100755 index a80045a4..00000000 --- a/bin/riemann-aws-billing +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env ruby -require 'fog' - - -require File.expand_path('../../lib/riemann/tools', __FILE__) - -$0 = __FILE__ - -class Riemann::Tools::AWSBilling - include Riemann::Tools - - opt :fog_credentials_file, "Fog credentials file", :type => String - opt :fog_credential, "Fog credentials to use", :type => String - - opt :access_key, "AWS access key", :type => String - opt :secret_key, "Secret access key", :type => String - opt :services, "AWS services: AmazonEC2 AmazonS3 AWSDataTransfer", :type => :strings, :multi => true, :default => ["AmazonEC2", "AmazonS3", "AWSDataTransfer"] - - opt :time_start, "Start time in seconds of the metrics period (2hrs ago default)", :type => Integer, :default => 7200 - opt :time_end, "End time in seconds of the metrics period ", :type => Integer, :default => 60 - - - def initialize - if options[:fog_credentials_file] - Fog.credentials_path = opts[:fog_credentials_file] - Fog.credential = opts[:fog_credential].to_sym - @cloudwatch = Fog::AWS::CloudWatch.new - else - @cloudwatch = Fog::AWS::CloudWatch.new(:aws_secret_access_key => opts[:secret_key], :aws_access_key_id => opts[:access_key]) - @start_time = (Time.now.utc - opts[:time_start]).iso8601 - @end_time = (Time.now.utc - opts[:time_end]).iso8601 - end - end - - def tick - opts[:services].each do |service| - data = @cloudwatch.get_metric_statistics({ - 'Statistics' => ["Maximum"], - 'StartTime' => @start_time, - 'EndTime' => @end_time, - 'Period' => 3600, - 'Unit' => "None", - 'MetricName' => "EstimatedCharges", - 'Namespace' => "AWS/Billing", - 'Dimensions' => [ - { - 'Name' => "ServiceName", - 'Value' => service - }, - { - 'Name' => "Currency", - 'Value' => "USD" - } - ] - }).body['GetMetricStatisticsResult']['Datapoints'] - - - data.each do |metrics| - name = "AWScloudwatch.Billing." + service - value = metrics["Maximum"] - timestamp = metrics["Timestamp"].to_i - - event = { - host: nil, - service: name, - time: timestamp, - description: "AWS Estimate Charges for #{service}", - tags: ["aws_billing"], - state: "ok", - metric: value - } - - report event - end - end - end -end - -Riemann::Tools::AWSBilling.run diff --git a/bin/riemann-aws-rds-status b/bin/riemann-aws-rds-status deleted file mode 100755 index 440c8d64..00000000 --- a/bin/riemann-aws-rds-status +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env ruby -require 'rubygems' -require 'fog' -require 'date' -require 'time' -require 'json' - -require File.expand_path('../../lib/riemann/tools', __FILE__) - -$0 = __FILE__ # Let's not expose our AWS keys in the process list - -class Riemann::Tools::AWS - include Riemann::Tools - - opt :access_key, "AWS access key", :type => String - opt :secret_key, "Secret access key", :type => String - opt :region, "AWS region", :type => String, :default => 'eu-west-1' - opt :dbinstance_identifier, "DBInstanceIdentifier", :type => String - def initialize - abort "FATAL: specify a DB instance name, see --help for usage" unless opts[:dbinstance_identifier] - @cloudwatch = Fog::AWS::CloudWatch.new(:aws_access_key_id => opts[:access_key], - :aws_secret_access_key => opts[:secret_key], - :region => opts[:region]) - end - - def tick - time = Time.new - ['DatabaseConnections', 'FreeableMemory', 'FreeStorageSpace', 'NetworkReceiveThroughput', 'NetworkTransmitThroughput', 'ReadThroughput', 'CPUUtilization'].each do |metric| - result = @cloudwatch.get_metric_statistics({"Namespace" => 'AWS/RDS', "MetricName" => "#{metric}", "Statistics" => 'Average', "Dimensions" => [{"Name" => "DBInstanceIdentifier", "Value" => "#{opts[:dbinstance_identifier]}"}], "StartTime" => (time-120).to_time.iso8601, "EndTime" => time.to_time.iso8601, "Period" => 60}) - metricsResult = result.data[:body]['GetMetricStatisticsResult'] - puts JSON.dump(metricsResult) - if (metricsResult['Datapoints'].length>0) - datapoint = metricsResult['Datapoints'][0] - ev = {:metric => datapoint['Average'], - :service => "#{opts[:dbinstance_identifier]}.#{metric} (#{datapoint['Unit']})", - :description => JSON.dump(metricsResult), - :state => "ok", - :ttl => 300} - - - report ev - end - - end - end -end - -Riemann::Tools::AWS.run diff --git a/bin/riemann-aws-sqs-status b/bin/riemann-aws-sqs-status deleted file mode 100755 index f19bc965..00000000 --- a/bin/riemann-aws-sqs-status +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env ruby -require 'rubygems' -require 'fog' - -require File.expand_path('../../lib/riemann/tools', __FILE__) - -$0 = __FILE__ # Let's not expose our AWS keys in the process list - -class Riemann::Tools::AWS - include Riemann::Tools - - opt :access_key, "AWS access key", :type => String - opt :secret_key, "Secret access key", :type => String - opt :region, "AWS region", :type => String, :default => 'us-east-1' - opt :queue, "SQS Queue name", :type => String - def initialize - @sqs = Fog::AWS::SQS.new(:aws_access_key_id => opts[:access_key], - :aws_secret_access_key => opts[:secret_key], - :region => opts[:region]) - response = @sqs.list_queues({'QueueNamePrefix' => opts[:queue]}) - @queue_url = response[:body]['QueueUrls'].first - end - - def tick - response = @sqs.get_queue_attributes(@queue_url, 'All') - ['ApproximateNumberOfMessages', 'ApproximateNumberOfMessagesNotVisible'].each do |attr| - msg = { - metric: response[:body]['Attributes'][attr], - service: "#{opts[:queue]} #{attr}", - state: 'ok' - } - report msg - end - end -end - -Riemann::Tools::AWS.run diff --git a/bin/riemann-aws-status b/bin/riemann-aws-status deleted file mode 100755 index 119f402c..00000000 --- a/bin/riemann-aws-status +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env ruby -require 'rubygems' -require 'fog' -require 'date' - -require File.expand_path('../../lib/riemann/tools', __FILE__) - -$0 = __FILE__ # Let's not expose our AWS keys in the process list - -class Riemann::Tools::AWS - include Riemann::Tools - - opt :access_key, "AWS access key", :type => String - opt :secret_key, "Secret access key", :type => String - opt :region, "AWS region", :type => String, :default => 'eu-west-1' - - opt :retirement_critical, "Number of days before retirement. Defaults to 2", :default => 2 - opt :event_warning, "Number of days before event. Defaults to nil (i.e. when the event appears)", :default => nil - - def initialize - @compute = Fog::Compute.new(:aws_access_key_id => opts[:access_key], - :aws_secret_access_key => opts[:secret_key], - :region => opts[:region], - :provider => 'AWS') - end - - def tick - instance_status = @compute.describe_instance_status.body["instanceStatusSet"] - status = instance_status.inject({}) do |acc,i| - acc[i.delete("instanceId")] = i - acc - end - - hosts = @compute.servers.select { |s| s.state == "running" }. - inject([status, {}]) do |(status, acc), host| - acc[host.private_dns_name] = status.delete(host.id); [status, acc] - end[1] - - hosts.each do |host, status| - status['eventsSet'].each do |event| - before, after = ['notBefore', 'notAfter'].map { |k| Date.parse event[k].to_s if event[k] } - - ev = {:host => host, - :service => "aws_instance_status", - :description => "#{event['code']}\n\nstart #{event['notBefore']}\nend #{event['notAfter']}\n\n#{event['description']}", - :state => "ok", - :ttl => 300} - - ev2 = if (event['code'] == 'instance-retirement') and - Date.today >= before-opts[:retirement_critical] - {:state => "critical"} - elsif opts[:event_warning] and Date.today >= before-opts[:event_warning] - {:state => "warning"} - else - {:state => "warning"} - end - - report ev.merge(ev2) - end - end - end -end - -Riemann::Tools::AWS.run diff --git a/bin/riemann-elb-metrics b/bin/riemann-elb-metrics deleted file mode 100755 index ab6ac483..00000000 --- a/bin/riemann-elb-metrics +++ /dev/null @@ -1,161 +0,0 @@ -#!/usr/bin/env ruby - -require File.expand_path('../../lib/riemann/tools', __FILE__) - -$0 = __FILE__ - -class Riemann::Tools::ELBMetrics - include Riemann::Tools - - require 'fog' - require 'time' - - opt :fog_credentials_file, "Fog credentials file", :type => String - opt :fog_credential, "Fog credentials to use", :type => String - opt :aws_access, "AWS Access Key", :type => String - opt :aws_secret, "AWS Secret Key", :type => String - opt :aws_region, "AWS Region", :type => String, :default => "eu-west-1" - opt :aws_azs, "List of AZs to aggregate against", :type => :strings, :default => [ "all_az" ] - opt :elbs, "List of ELBs to pull metrics from", :type => :strings, :required => true - - def standard_metrics - # ELB metric types, from: - # http://docs.aws.amazon.com/AmazonCloudWatch/latest/DeveloperGuide/CW_Support_For_AWS.html#elb-metricscollected - metric_options = { - "Latency" => { - "Unit" => "Seconds", - "Statistics" => ["Maximum", "Minimum", "Average" ] - }, - "RequestCount" => { - "Unit" => "Count", - "Statistics" => [ "Sum" ] - }, - "HealthyHostCount" => { - "Units" => "Count", - "Statistics" => [ "Minimum", "Maximum", "Average" ] - }, - "UnHealthyHostCount" => { - "Units" => "Count", - "Statistics" => [ "Minimum", "Maximum", "Average" ] - }, - "HTTPCode_ELB_4XX" => { - "Units" => "Count", - "Statistics" => [ "Sum" ] - }, - "HTTPCode_ELB_5XX" => { - "Units" => "Count", - "Statistics" => [ "Sum" ] - }, - "HTTPCode_Backend_2XX" => { - "Units" => "Count", - "Statistics" => [ "Sum" ] - }, - "HTTPCode_Backend_3XX" => { - "Units" => "Count", - "Statistics" => [ "Sum" ] - }, - "HTTPCode_Backend_4XX" => { - "Units" => "Count", - "Statistics" => [ "Sum" ] - }, - "HTTPCode_Backend_5XX" => { - "Units" => "Count", - "Statistics" => [ "Sum" ] - } - } - - metric_options - end - - def base_metrics - # get last 60 seconds - start_time = (Time.now.utc - 60).iso8601 - end_time = Time.now.utc.iso8601 - - # The base query that all metrics would get - metric_base = { - "Namespace" => "AWS/ELB", - "StartTime" => start_time, - "EndTime" => end_time, - "Period" => 60, - } - - metric_base - end - - - def tick - if options[:fog_credentials_file] - Fog.credentials_path = options[:fog_credentials_file] - Fog.credential = options[:fog_credential].to_sym - connection = Fog::AWS::CloudWatch.new - else - if options[:aws_access] && options[:aws_secret] - connection = Fog::AWS::CloudWatch.new({ - :aws_access_key_id => options[:aws_access], - :aws_secret_access_key => options[:aws_secret], - :region => options[:aws_region] - }) - else - connection = Fog::AWS::CloudWatch.new({ - :use_iam_profile => true, - :region => options[:aws_region] - }) - end - end - - options[:elbs].each do |lb| - - metric_options = standard_metrics - metric_base_options = base_metrics - - options[:aws_azs].each do |az| - metric_options.keys.sort.each do |metric_type| - merged_options = metric_base_options.merge(metric_options[metric_type]) - merged_options["MetricName"] = metric_type - if az == "all_az" - merged_options["Dimensions"] = [ { "Name" => "LoadBalancerName", "Value" => lb } ] - else - merged_options["Dimensions"] = [ - { "Name" => "LoadBalancerName", "Value" => lb }, - { "Name" => "AvailabilityZone" , "Value" => az} - ] - end - - result = connection.get_metric_statistics(merged_options) - - # "If no response codes in the category 2XX-5XX range are sent to clients within - # the given time period, values for these metrics will not be recorded in CloudWatch" - #next if result.body["GetMetricStatisticsResult"]["Datapoints"].empty? && metric_type =~ /[2345]XX/ - # - # BUG: - # Metrics are reported every 60 seconds, but sometimes there isn't one there yet. - # We can skip that, or do something else? - next if result.body["GetMetricStatisticsResult"]["Datapoints"].empty? - - # We should only ever have a single data point - result.body["GetMetricStatisticsResult"]["Datapoints"][0].keys.sort.each do |stat_type| - next if stat_type == "Unit" - next if stat_type == "Timestamp" - - unit = result.body["GetMetricStatisticsResult"]["Datapoints"][0]["Unit"] - metric = result.body["GetMetricStatisticsResult"]["Datapoints"][0][stat_type] - event = Hash.new - event = { - host: lb, - service: "elb.#{az}.#{metric_type}.#{stat_type}", - ttl: 60, - description: "#{lb} #{metric_type} #{stat_type} (#{unit})", - tags: [ "production", "elb_metrics" ], - metric: metric - } - - report(event) - end - end - end - end - end -end - -Riemann::Tools::ELBMetrics.run From 91a29b36a45e401a6eeec95c081615281bb6bc05 Mon Sep 17 00:00:00 2001 From: James Turnbull Date: Sun, 17 Jan 2016 12:43:29 -0500 Subject: [PATCH 10/11] Split Riemann Riak out to https://github.com/riemann/riemann-riak --- Rakefile.rb | 1 - bin/riemann-riak | 331 ------------------------------------- bin/riemann-riak-keys | 12 -- bin/riemann-riak-ring | 8 - riak_status/key_count.erl | 12 -- riak_status/riak_status.rb | 151 ----------------- riak_status/ringready.erl | 8 - 7 files changed, 523 deletions(-) delete mode 100755 bin/riemann-riak delete mode 100755 bin/riemann-riak-keys delete mode 100755 bin/riemann-riak-ring delete mode 100755 riak_status/key_count.erl delete mode 100755 riak_status/riak_status.rb delete mode 100755 riak_status/ringready.erl diff --git a/Rakefile.rb b/Rakefile.rb index 4c739ee4..df1658bb 100644 --- a/Rakefile.rb +++ b/Rakefile.rb @@ -23,7 +23,6 @@ s.add_dependency 'riemann-client', '>= 0.2.2' s.add_dependency 'trollop', '>= 1.16.2' - s.add_dependency 'yajl-ruby', '>= 1.1.0' s.add_dependency 'nokogiri', '>= 1.5.6' s.files = FileList['lib/**/*', 'bin/*', 'LICENSE', 'README.markdown'].to_a diff --git a/bin/riemann-riak b/bin/riemann-riak deleted file mode 100755 index 7688ae7a..00000000 --- a/bin/riemann-riak +++ /dev/null @@ -1,331 +0,0 @@ -#!/usr/bin/env ruby - -# Forwards information on a Riak node to Riemann. - -require File.expand_path('../../lib/riemann/tools', __FILE__) - -require 'net/http' -require 'net/https' -require 'yajl/json_gem' - -class Riemann::Tools::Riak - include Riemann::Tools - - opt :riak_host, "Riak host for stats or SSL http(s)://", :default => Socket.gethostname - opt :data_dir, "Riak data directory", :default => '/var/lib/riak' - opt :stats_port, "Riak HTTP port for stats", :default => 8098 - opt :stats_path, "Riak HTTP stats path", :default => '/stats' - opt :node_name, "Riak erlang node name", :default => "riak@#{Socket.gethostname}" - opt :cookie, "Riak cookie to use", :default => "riak" - - opt :get_50_warning, "FSM 50% get time warning threshold (ms)", :default => 1000 - opt :put_50_warning, "FSM 50% put time warning threshold (ms)", :default => 1000 - opt :get_95_warning, "FSM 95% get time warning threshold (ms)", :default => 2000 - opt :put_95_warning, "FSM 95% put time warning threshold (ms)", :default => 2000 - opt :get_99_warning, "FSM 99% get time warning threshold (ms)", :default => 10000 - opt :put_99_warning, "FSM 99% put time warning threshold (ms)", :default => 10000 - - def initialize - detect_features - - @httpstatus = true - - begin - uri = URI.parse(opts[:riak_host]) - if uri.host == nil - uri.host = opts[:riak_host] - end - http = Net::HTTP.new(uri.host, opts[:stats_port]) - http.use_ssl = uri.scheme == 'https' - if http.use_ssl? - http.verify_mode = OpenSSL::SSL::VERIFY_NONE - end - http.start do |h| - h.get opts[:stats_path] - end - rescue => _e - @httpstatus = false - end - - # we're going to override the emulator setting to allow users to - # dynamically input the cookie - # this is done only once - hopefully it doesn't get overridden. - ENV['ERL_AFLAGS'] = "-setcookie #{opts[:cookie]}" - end - - # Identifies whether escript and riak-admin are installed - def detect_features - @escript = true # Whether escript is present on this machine - @riakadmin = true # Whether riak-admin is present - - if `which escript` =~ /^\s*$/ - @escript = false - end - - if `which riak-admin` =~ /^\s*$/ - @riakadmin = false - end - end - - def check_ring - str = if @escript - str = `#{File.expand_path(File.dirname(__FILE__))}/riemann-riak-ring #{opts[:node_name]}`.chomp - elsif @riakadmin - str = `riak-admin ringready` - else - nil - end - - return if str.nil? - - if str =~ /^TRUE/ - report( - :host => opts[:riak_host], - :service => 'riak ring', - :state => 'ok', - :description => str - ) - else - report( - :host => opts[:riak_host], - :service => 'riak ring', - :state => 'warning', - :description => str - ) - end - end - - def check_keys - keys = `#{File.expand_path(File.dirname(__FILE__))}/riemann-riak-keys #{opts[:node_name]}`.chomp - if keys =~ /^\d+$/ - report( - :host => opts[:riak_host], - :service => 'riak keys', - :state => 'ok', - :metric => keys.to_i, - :description => keys - ) - else - report( - :host => opts[:riak_host], - :service => 'riak keys', - :state => 'unknown', - :description => keys - ) - end - end - - def check_transfers - str = if @riakadmin - `riak-admin transfers` - else - nil - end - - return if str.nil? - - if str =~ /'#{opts[:node_name]}' waiting to handoff (\d+) partitions/ - report( - :host => opts[:riak_host], - :service => 'riak transfers', - :state => 'critical', - :metric => $1.to_i, - :description => "waiting to handoff #{$1} partitions" - ) - else - report( - :host => opts[:riak_host], - :service => 'riak transfers', - :state => 'ok', - :metric => 0, - :description => "No pending transfers" - ) - end - end - - def check_disk - gb = `du -Ls #{opts[:data_dir]}`.split(/\s+/).first.to_i / (1024.0**2) - report( - :host => opts[:riak_host], - :service => 'riak disk', - :state => 'ok', - :metric => gb, - :description => "#{gb} GB in #{opts[:data_dir]}" - ) - end - - # Returns the riak stat for the given fsm type and percentile. - def fsm_stat(type, property, percentile) - "node_#{type}_fsm_#{property}_#{percentile == 50 ? 'median' : percentile}" - end - - # Returns the alerts state for the given fsm. - def fsm_state(type, percentile, val) - limit = opts["#{type}_#{percentile}_warning".to_sym] - case val - when 0 .. limit - 'ok' - when limit .. limit * 2 - 'warning' - else - 'critical' - end - end - - # Get current stats via HTTP - def stats_http - begin - uri = URI.parse(opts[:riak_host]) - if uri.host == nil - uri.host = opts[:riak_host] - end - http = Net::HTTP.new(uri.host, opts[:stats_port]) - http.use_ssl = uri.scheme == 'https' - if http.use_ssl? - http.verify_mode = OpenSSL::SSL::VERIFY_NONE - end - res = http.start do |h| - h.get opts[:stats_path] - end - rescue => e - report( - :host => opts[:riak_host], - :service => 'riak', - :state => 'critical', - :description => "error fetching #{opts[:riak_host]}:#{opts[:stats_port]} #{e.class}, #{e.message}" - ) - raise - end - - if res.code.to_i == 200 - return JSON.parse(res.body) - else - report( - :host => opts[:riak_host], - :service => 'riak', - :state => 'critical', - :description => "stats returned HTTP #{res.code}:\n\n#{res.body}" - ) - raise "Can't fetch stats via HTTP: #{res.core}:\n\n#{res.body}" - end - end - - # Get current stats via riak-admin - def stats_riak_admin - str = `riak-admin status` - raise "riak-admin failed" unless $? == 0 - Hash[str.split(/\n/).map{|i| i.split(/ : /)}] - end - - # Get current stats as a hash - def stats - if @httpstatus - stats_http - elsif @riakadmin - stats_riak_admin - else - report( - :host => opts[:riak_host], - :service => 'riak', - :state => 'critical', - :description => "No mechanism for fetching Riak stats: neither HTTP nor riak-admin available." - ) - raise "No mechanism for fetching Riak stats: neither HTTP nor riak-admin available." - end - end - - def core_services - ['vnode_gets', - 'vnode_puts', - 'node_gets', - 'node_puts', - 'node_gets_set', - 'node_puts_set', - 'read_repairs'] - end - - def fsm_types - [{'get' => 'time'}, {'put' => 'time'}, - {'get' => 'set_objsize'}] - end - - def fsm_percentiles - [50, 95, 99] - end - - # Reports current stats to Riemann - def check_stats - begin - stats = self.stats - rescue => e - event = {:state => 'critical', - :description => e.message, - :host => opts[:riak_host]} - # Report errors - report(event.merge(:service => 'riak')) - core_services.each do |s| - report(event.merge(:service => "riak #{s}")) - end - fsm_types.each do |typespec| - typespec.each do |type, prop| - fsm_percentiles.each do |percentile| - report(event.merge(:service => "riak #{type} #{prop} #{percentile}")) - end - end - end - return - end - - # Riak itself - report( - :host => opts[:riak_host], - :service => 'riak', - :state => 'ok' - ) - - # Gets/puts/rr - core_services.each do |s| - report( - :host => opts[:riak_host], - :service => "riak #{s}", - :state => 'ok', - :metric => stats[s].to_i/60.0, - :description => "#{stats[s].to_i/60.0}/sec" - ) - end - - # FSMs - fsm_types.each do |typespec| - typespec.each do |type, prop| - fsm_percentiles.each do |percentile| - val = stats[fsm_stat(type, prop, percentile)].to_i || 0 - val = 0 if val == 'undefined' - val /= 1000.0 if prop == 'time' # Convert us to ms - if prop == 'time' - state = fsm_state(type, percentile, val) - else - state = "ok" - end - report( - :host => opts[:riak_host], - :service => "riak #{type} #{prop} #{percentile}", - :state => state, - :metric => val, - :description => "#{val} ms" - ) - end - end - end - end - - def tick - # This can utterly destroy a cluster, so we disable - # check_keys - check_stats - check_ring - check_disk - check_transfers - end -end - -Riemann::Tools::Riak.run diff --git a/bin/riemann-riak-keys b/bin/riemann-riak-keys deleted file mode 100755 index 52f690f3..00000000 --- a/bin/riemann-riak-keys +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/env escript -%%! -name riakstatuscheck@127.0.0.1 -hidden - -main([]) -> main(["riak@127.0.0.1"]); -main([Node]) -> - io:format("~w\n", [ - lists:foldl( - fun({_VNode, Count}, Sum) -> Sum + Count end, - 0, - rpc:call(list_to_atom(Node), riak_kv_bitcask_backend, key_counts, []) - ) - ]). diff --git a/bin/riemann-riak-ring b/bin/riemann-riak-ring deleted file mode 100755 index f87fe3d3..00000000 --- a/bin/riemann-riak-ring +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env escript -%%! -name riakstatuscheck@127.0.0.1 -hidden - -main([]) -> main(["riak@127.0.0.1"]); -main([Node]) -> - io:format("~p\n", [ - rpc:call(list_to_atom(Node), riak_kv_console, ringready, [[]]) - ]). diff --git a/riak_status/key_count.erl b/riak_status/key_count.erl deleted file mode 100755 index 65512ba4..00000000 --- a/riak_status/key_count.erl +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/env escript -%%! -name riakstatuscheck -setcookie riak -hidden - -main([]) -> main(["riak@127.0.0.1"]); -main([Node]) -> - io:format("~w\n", [ - lists:foldl( - fun({_VNode, Count}, Sum) -> Sum + Count end, - 0, - rpc:call(list_to_atom(Node), riak_kv_bitcask_backend, key_counts, []) - ) - ]). diff --git a/riak_status/riak_status.rb b/riak_status/riak_status.rb deleted file mode 100755 index 6efe7ee1..00000000 --- a/riak_status/riak_status.rb +++ /dev/null @@ -1,151 +0,0 @@ -#!/usr/bin/env ruby - -$LOAD_PATH.unshift File.expand_path("#{File.dirname(__FILE__)}/../vodpod-common/lib") -require 'rubygems' -require 'vodpod-common' -require 'vodpod/alerts' -require 'vodpod/starling' -require 'net/http' -require 'yajl/json_gem' - -class RiakStatus - PORT = 8098 - PATH = '/stats' - INTERVAL = 10 - - FSM_LIMITS = { - :get => { - 50 => 1000, - 95 => 2000, - 99 => 10000 - }, - :put => { - 50 => 1000, - 95 => 2000, - 99 => 10000 - } - } - - def initialize(opts = {}) - @host = opts[:host] || `hostname`.chomp - @port = opts[:port] || PORT - @path = opts[:path] || PATH - end - - def alert(subservice, state, metric, description) - Vodpod.alert( - :service => "riak #{subservice}", - :state => state, - :metric => metric, - :description => description - ) - end - - def check_ring - str = `#{File.expand_path(File.dirname(__FILE__))}/ringready.erl riak@#{`hostname`}`.chomp - if str =~ /^TRUE/ - alert 'ring', :ok, nil, str - else - alert 'ring', :warning, nil, str - end - end - - def check_keys - keys = `#{File.expand_path(File.dirname(__FILE__))}/key_count.erl riak@#{`hostname`}`.chomp - if keys =~ /^\d+$/ - alert 'keys', :ok, keys.to_i, keys - else - alert 'keys', :error, nil, keys - end - end - - def check_disk - gb = `du -s /var/lib/riak/bitcask/`.split(/\s+/).first.to_i / (1024.0**2) - alert 'disk', :ok, gb, "#{gb} GB in bitcask" - end - - # Returns the riak stat for the given fsm type and percentile. - def fsm_stat(type, percentile) - "node_#{type}_fsm_time_#{percentile == 50 ? 'median' : percentile}" - end - - # Returns the alerts state for the given fsm. - def fsm_state(type, percentile, val) - limit = FSM_LIMITS[type][percentile] - case val - when 0 .. limit - :ok - when limit .. limit * 2 - :warning - else - :critical - end - end - - def check_stats - begin - res = Net::HTTP.start(@host, @port) do |http| - http.get('/stats') - end - rescue => e - Vodpod.alert( - :service => 'riak', - :state => :critical, - :description => "error fetching /stats: #{e.class}, #{e.message}" - ) - return - end - - if res.code.to_i == 200 - stats = JSON.parse(res.body) - else - Vodpod.alert( - :service => 'riak', - :state => :critical, - :description => "stats returned HTTP #{res.code}:\n\n#{res.body}" - ) - return - end - - Vodpod.alert( - :service => 'riak', - :state => :ok - ) - - # Gets/puts/rr - [ - 'vnode_gets', - 'vnode_puts', - 'node_gets', - 'node_puts', - 'read_repairs' - ].each do |s| - alert s, :ok, stats[s]/60.0, "#{stats[s]/60.0}/sec" - end - - # FSMs - [:get, :put].each do |type| - [50, 95, 99].each do |percentile| - val = stats[fsm_stat(type, percentile)] || 0 - val = 0 if val == 'undefined' - val /= 1000.0 # Convert us to ms - state = fsm_state(type, percentile, val) - alert "#{type} #{percentile}", state, val, "#{val} ms" - end - end - end - - def run - loop do -# check_keys - check_stats - check_ring - check_disk - sleep INTERVAL - end - end -end - -if $0 == __FILE__ - RiakStatus.new.run -end diff --git a/riak_status/ringready.erl b/riak_status/ringready.erl deleted file mode 100755 index 4dfda0dd..00000000 --- a/riak_status/ringready.erl +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env escript -%%! -name riakstatuscheck -setcookie riak -hidden - -main([]) -> main(["riak@127.0.0.1"]); -main([Node]) -> - io:format("~p\n", [ - rpc:call(list_to_atom(Node), riak_kv_console, ringready, [[]]) - ]). From d5507d43bf9135c2eb66e286064ed0c50e3d3f05 Mon Sep 17 00:00:00 2001 From: James Turnbull Date: Wed, 20 Jan 2016 15:00:21 -0500 Subject: [PATCH 11/11] Fixed dependencies in Rakefile --- Rakefile.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Rakefile.rb b/Rakefile.rb index df1658bb..ddf159c4 100644 --- a/Rakefile.rb +++ b/Rakefile.rb @@ -23,7 +23,7 @@ s.add_dependency 'riemann-client', '>= 0.2.2' s.add_dependency 'trollop', '>= 1.16.2' - s.add_dependency 'nokogiri', '>= 1.5.6' + s.add_dependency 'json' s.files = FileList['lib/**/*', 'bin/*', 'LICENSE', 'README.markdown'].to_a s.executables |= Dir.entries('bin/')