Skip to content

Commit

Permalink
Merge pull request #286 from hathitrust/DEV-573-bugfix
Browse files Browse the repository at this point in the history
DEV-573: Fix broken phase lookup for Shared Print reporting purposes
  • Loading branch information
mwarin authored Oct 12, 2023
2 parents 710e98b + 39ccbda commit 65963cc
Show file tree
Hide file tree
Showing 12 changed files with 477 additions and 108 deletions.
22 changes: 16 additions & 6 deletions bin/inspect_ocn.rb
Original file line number Diff line number Diff line change
@@ -1,23 +1,33 @@
# frozen_string_literal: true

# For manual/ocular inspection of a single OCN.
# Takes an OCN and outputs a pretty-printed matching cluster.
# For manual/ocular inspection of clusters based on OCN(s).
# Takes 1+ OCN(s) commandline args and outputs pretty-printed array
# of matching clusters.
# Usage:
# $ bundle exec ruby bin/inspect_ocn.rb req:ocn_1 (... opt:ocn_n)

require "cluster"
require "services"
require "json"

# Script entry point: calls Services.mongo!, runs look_up for the OCN(s) given
# on the command line, prints @buffer as pretty-printed JSON on stdout and the
# collected @warnings (one per line) on stderr.
# NOTE(review): this listing appears to interleave pre-change and post-change
# diff lines (the ARGV.shift / puts pair next to the ARGV.each loop) — confirm
# against the committed file before relying on the exact statement order.
def main
Services.mongo!
ocn = ARGV.shift
puts look_up(ocn)
# Accumulators filled by look_up: cluster documents and warning messages.
@buffer = []
@warnings = []

ARGV.each do |ocn|
look_up(ocn)
end
puts JSON.pretty_generate(@buffer)
warn @warnings.join("\n")
end

# Looks up the cluster whose ocns match the given OCN (converted with to_i).
# On a miss, appends a warning message to @warnings; on a hit, appends
# cluster.as_document to @buffer.
# NOTE(review): the bare string literal and the JSON.pretty_generate call look
# like pre-change diff lines shown next to their replacements — confirm against
# the committed file.
def look_up(ocn)
cluster = Cluster.find_by(ocns: ocn.to_i)
if cluster.nil?
"No cluster found for OCN #{ocn}."
@warnings << "# Warning: No cluster found for OCN #{ocn}."
else
JSON.pretty_generate(cluster.as_document)
@buffer << cluster.as_document
end
end

Expand Down
27 changes: 19 additions & 8 deletions lib/cluster.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,24 +17,35 @@
class Cluster
include Mongoid::Document
store_in collection: "clusters"

# Cluster level stuff:
field :ocns
field :last_modified, type: DateTime
index({ocns: 1}, unique: true, partial_filter_expression: {ocns: {:$gt => 0}})
index({last_modified: 1})
scope :for_ocns, ->(ocns) { where(:ocns.in => ocns) }

# Holdings level stuff:
embeds_many :holdings, class_name: "Clusterable::Holding"

# HtItems level stuff:
embeds_many :ht_items, class_name: "Clusterable::HtItem"
embeds_many :ocn_resolutions, class_name: "Clusterable::OCNResolution"
embeds_many :commitments, class_name: "Clusterable::Commitment"
index({ocns: 1},
unique: true,
partial_filter_expression: {ocns: {:$gt => 0}})
index({"ht_items.item_id": 1}, unique: true, sparse: true)
scope :with_ht_item, ->(ht_item) { where("ht_items.item_id": ht_item.item_id) }

# OCNResolution level stuff:
embeds_many :ocn_resolutions, class_name: "Clusterable::OCNResolution"
index({"ocn_resolutions.ocns": 1}, unique: true, sparse: true)
index({last_modified: 1})
scope :for_resolution, lambda { |resolution|
where(:ocns.in => [resolution.deprecated, resolution.resolved])
}
scope :for_ocns, ->(ocns) { where(:ocns.in => ocns) }
scope :with_ht_item, ->(ht_item) { where("ht_items.item_id": ht_item.item_id) }

# Commitments level stuff:
embeds_many :commitments, class_name: "Clusterable::Commitment"
index({"commitments.phase": 1}, unique: false, sparse: true) # keep
index({"commitments.committed_date": 1}, unique: false, sparse: true) # discard once phase is set

# Hooks:
before_save { |c| c.last_modified = Time.now.utc }

validates_each :ocns do |record, attr, value|
Expand Down
9 changes: 9 additions & 0 deletions lib/clusterable/commitment.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# frozen_string_literal: true

require "mongoid"
require "shared_print/phases"

module Clusterable
# A shared print commitment
Expand All @@ -25,6 +26,7 @@ class Commitment
field :deprecation_status, type: String
field :deprecation_date, type: DateTime
field :deprecation_replaced_by, type: String
field :phase, type: Integer, default: 0

embedded_in :cluster

Expand All @@ -37,6 +39,7 @@ class Commitment
validates_inclusion_of :retention_condition, in: ["EXCELLENT", "ACCEPTABLE"], allow_nil: true
validate :deprecation_validation
validate :other_commitment_validation
validate :phase_validation

def initialize(_params = nil)
super
Expand Down Expand Up @@ -95,5 +98,11 @@ def other_commitment_validation
errors.add(:other_retention_date, "cannot be set if other_program is nil")
end
end

# Validation hook: records an error on :phase unless the commitment's phase
# is one of the values returned by SharedPrint::Phases.list.
def phase_validation
  return if SharedPrint::Phases.list.include?(phase)

  errors.add(:phase, "Not a recognized phase")
end
end
end
28 changes: 12 additions & 16 deletions lib/shared_print/finder.rb
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,17 @@ def initialize(organization: [], ocn: [], local_id: [], deprecated: false, phase
@phase = phase
# Put together a query based on the criteria gathered.
@query = build_query
validate!
end

# Checks every requested phase against SharedPrint::Phases.list.
# Raises ArgumentError on the first phase that is not in that list.
def validate!
  return unless @phase.any?

  @phase.each do |candidate|
    next if SharedPrint::Phases.list.include?(candidate)

    raise ArgumentError, "#{candidate} is not a recognized shared print phase"
  end
end

# Yield matching clusters.
Expand Down Expand Up @@ -58,7 +69,7 @@ def build_query
q["commitments.local_id"] = {"$in": @local_id}
end
if @phase.any?
q["commitments.committed_date"] = {"$in": phase_to_date}
q["commitments.phase"] = {"$in": @phase}
end

q
Expand All @@ -79,20 +90,5 @@ def match?(commitment)
# True when arr is empty (no restriction) or when arr contains val.
def empty_or_include?(arr, val)
  return true if arr.empty?

  arr.include?(val)
end

# Translates phase numbers into their committed dates.
#
# In shared print, commitments were accepted in phases with slightly different
# criteria (e.g. policies and condition became required fields for phase 3).
# Each phase is associated with the date it was committed, so this mapping
# lets us find the commitments for a certain phase.
#
# Duplicates in +phase+ are dropped. Raises ArgumentError for a phase that is
# not a key in SharedPrint::Phases.phase_to_date.
def phase_to_date(phase = @phase)
  dates_by_phase = SharedPrint::Phases.phase_to_date
  phase.uniq.map do |number|
    dates_by_phase.fetch(number) do
      raise ArgumentError, "Phase #{number} is not a recognized shared print phase"
    end
  end
end
end
end
62 changes: 38 additions & 24 deletions lib/shared_print/phase_3_validator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,49 +16,63 @@ class Phase3Validator
attr_reader :path, :last_error, :log

# Prepares a validator for the commitments file at +path+: obtains a file
# handle and a loader for it from Loader::SharedPrintLoader, opens a
# "<basename of path>.log" file under Settings.local_report_path, and stores
# the phase 3 number, date and required policies in instance variables.
# Raises a RuntimeError if Settings.local_report_path is nil.
# NOTE(review): several assignments appear twice below (@last_error, @log,
# @phase_3_required_policies, @phase_3_date); this looks like pre-change and
# post-change diff lines shown together — confirm against the committed file.
def initialize(path)
# Input comes from a file.
# Set up a handle to read records from that file,
# and a loader to load those records into the db.
@path = path
@last_error = nil
@log = nil
@handle = Loader::SharedPrintLoader.filehandle_for(@path)
@loader = Loader::SharedPrintLoader.for(@path)

# Any commitment in Phase 3 should have 1+ of these policies:
@phase_3_required_policies = ["blo", "non-circ"]
@phase_3_date = DateTime.parse(SharedPrint::Phases::PHASE_3_DATE)
# Setup dirs
# Setup log file
if Settings.local_report_path.nil?
raise "Missing Settings.local_report_path"
end
FileUtils.mkdir_p(Settings.local_report_path)
base = File.basename(@path)
@last_error = nil
@log = File.open(File.join(Settings.local_report_path, base) + ".log", "w")

# Any commitment in Phase 3 should have these fields set:
@phase_3 = SharedPrint::Phases::PHASE_3
@phase_3_required_policies = ["blo", "non-circ"] # at least one of these policies
@phase_3_date = DateTime.parse(SharedPrint::Phases::PHASE_3_DATE)
end

# Check all commitments in file and load the valid ones,
# log both loaded and non-valid commitments to file.
# The ensure clause writes "Done." to @log and closes it, whether or not
# processing raised.
# NOTE(review): this listing appears to contain both the pre-change inline
# loop and the post-change load_all call — confirm against the committed file.
def run
# Setup log files
base = File.basename(@path)
@log = File.open(File.join(Settings.local_report_path, base) + ".log", "w")
@log.puts "Checking if commitments in #{@path} are valid..."
# Go through input and process
loader = Loader::SharedPrintLoader.for(@path)
handle = Loader::SharedPrintLoader.filehandle_for(@path)
handle.each do |line|
# commitment is an unsaved commitment until it has passed
# validation and is then saved by load()
commitment = loader.item_from_line(line)
commitment.committed_date = @phase_3_date
if pass_validation? commitment
loader.load commitment
@log.puts "Loaded #{commitment.inspect}"
else
@log.puts "Failed to load #{commitment.inspect}"
end
Thread.pass
end
load_all
ensure
# Close log files
@log.puts "Done."
@log.close
end

# Reads every line from @handle, turns it into an (unsaved) commitment via
# @loader.item_from_line and hands it to load_one, which validates and saves.
# Yields to other threads after each line.
def load_all
  @handle.each do |line|
    load_one(@loader.item_from_line(line))
    Thread.pass
  end
end

# Stamps the commitment with the phase 3 date and phase number, then loads it
# through @loader if it passes validation. The outcome (loaded or failed) is
# written to @log either way.
def load_one(commitment)
  commitment.committed_date = @phase_3_date
  commitment.phase = @phase_3

  unless pass_validation?(commitment)
    @log.puts "Failed to load #{commitment.inspect}"
    return
  end

  @loader.load(commitment)
  @log.puts "Loaded #{commitment.inspect}"
end

# Check if a given commitment is valid (for the phase 3 definition of valid).
# Log errors.
def pass_validation?(commitment)
Expand Down
50 changes: 50 additions & 0 deletions lib/shared_print/phase_updater.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# frozen_string_literal: true

require "mongo_updater"

# This is an outer wrapper for a MongoUpdater call.
# Objective: based on commitments.committed_date, set commitments.phase.
# Usage: bundle exec ruby lib/shared_print/phase_updater.rb <date_str> <phase>
# E.g. : bundle exec ruby lib/shared_print/phase_updater.rb "2023-01-31 00:00:00 UTC" 3

# Sets commitments.phase on the commitments whose committed_date matches a
# given date string, by delegating to MongoUpdater.update_embedded.
class PhaseUpdater
  # Required shape of the date argument, e.g. "2023-01-31 00:00:00 UTC".
  # Anchored with \A and \z rather than ^ and $, which match per line and
  # would accept multi-line input that merely contains a well-formed line.
  DATE_RX = /\A\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2}\s[A-Z]{3}\z/
  # A phase must be written as plain digits; String#to_i alone would turn
  # "three", "" or nil into 0, which is itself an accepted phase.
  PHASE_RX = /\A\d+\z/
  VALID_PHASES = [0, 1, 2, 3].freeze

  # @param date [String] committed_date to match, formatted like
  #   "2023-01-31 00:00:00 UTC"
  # @param phase [String, Integer] phase to set, one of 0, 1, 2, 3
  # Exits the process with status 1 if either argument fails validation.
  def initialize(date, phase)
    # Get input
    @date = date
    @phase = phase

    validate!
    puts "Get commitments with committed_date #{@date}."
    puts "Set phase to #{@phase}."
  end

  # Make sure date and phase look like they should.
  # On success @phase is normalized to an Integer. On failure the reason is
  # reported on stderr and the process exits with a non-zero status, so that
  # calling scripts can detect the failure.
  def validate!
    raise ArgumentError, "bad date: #{@date}" unless DATE_RX.match?(@date)

    raw_phase = @phase.to_s.strip
    unless PHASE_RX.match?(raw_phase) && VALID_PHASES.include?(raw_phase.to_i)
      raise ArgumentError, "bad phase: #{@phase.inspect}"
    end
    @phase = raw_phase.to_i
  rescue ArgumentError => e
    warn "ERROR: Failed validation: #{e.message}"
    exit 1
  end

  # Pass on call to MongoUpdater which does all the lifting.
  # Prints start time, the inspected result and finish time on stdout.
  def run
    puts "Started: #{Time.now.utc}"
    res = MongoUpdater.update_embedded(
      clusterable: "commitments",
      matcher: {committed_date: @date},
      updater: {phase: @phase}
    )
    puts res.inspect
    puts "Finished: #{Time.now.utc}"
  end
end

# Command-line entry point: runs only when this file is executed directly.
# The first two arguments are taken as <date_str> and <phase>.
if __FILE__ == $PROGRAM_NAME
  date, phase = ARGV.shift(2)
  PhaseUpdater.new(date, phase).run
end
20 changes: 14 additions & 6 deletions lib/shared_print/phases.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,25 @@

module SharedPrint
# Constants for the shared print phases: the phase numbers, the date string
# associated with phases 1-3, and class methods returning them.
# NOTE(review): the date constants and the hash entries each appear twice
# below; this looks like pre-change and post-change diff lines shown together
# — confirm against the committed file.
class Phases
PHASE_1_DATE = "2017-09-30"
PHASE_2_DATE = "2019-02-28"
PHASE_3_DATE = "2023-01-31"
PHASE_0 = 0 # Default, has no associated date
PHASE_1 = 1
PHASE_2 = 2
PHASE_3 = 3
PHASE_1_DATE = "2017-09-30 00:00:00 UTC"
PHASE_2_DATE = "2019-02-28 00:00:00 UTC"
PHASE_3_DATE = "2023-01-31 00:00:00 UTC"

# Returns a hash from phase number to that phase's date string.
# Call .invert on this if you ever need reverse map
def self.phase_to_date
{
1 => PHASE_1_DATE,
2 => PHASE_2_DATE,
3 => PHASE_3_DATE
PHASE_1 => PHASE_1_DATE,
PHASE_2 => PHASE_2_DATE,
PHASE_3 => PHASE_3_DATE
}
end

# Returns all recognized phase numbers, including the default PHASE_0.
def self.list
[PHASE_0, PHASE_1, PHASE_2, PHASE_3]
end
end
end
22 changes: 22 additions & 0 deletions lib/shared_print/select_distinct_phases.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# frozen_string_literal: true

require "basic_query_report"

# Aggregation pipeline that prints each distinct commitments.committed_date
# found in the db, one per line.
only_with_commitments = {"$match": {"commitments.0": {"$exists": 1}}}
one_doc_per_commitment = {"$unwind": "$commitments"}
commitments_only = {"$project": {commitments: 1}}
group_by_committed_date = {"$group": {_id: {date: "$commitments.committed_date"}}}

pipeline = [
  only_with_commitments,
  one_doc_per_commitment,
  commitments_only,
  group_by_committed_date
]

BasicQueryReport.new.aggregate(pipeline) { |row| puts row["_id"]["date"] }

# 2023-01-31 00:00:00 UTC
# 2022-01-01 00:00:00 UTC
# 2021-01-01 05:00:00 UTC
# 2017-09-30 04:00:00 UTC
# 2019-02-28 05:00:00 UTC
# 1970-01-01 00:00:01 UTC
Loading

0 comments on commit 65963cc

Please sign in to comment.