Skip to content

Commit

Permalink
adds specs for skos_to_db
Browse files Browse the repository at this point in the history
  • Loading branch information
niquerio committed Sep 18, 2023
1 parent 4428dea commit 0cd9cf4
Show file tree
Hide file tree
Showing 3 changed files with 99 additions and 66 deletions.
149 changes: 83 additions & 66 deletions bin/names/skos_to_db.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,79 +6,96 @@
require "authority_browse"
require "logger"

LOGGER = Logger.new($stderr)
# skos file is from https://id.loc.gov/download/ choose JSONLD bulk export for
# LC Name Authority File (LCNAF)

skosfile = ARGV.shift
dbfile = ARGV.shift
# NOTE(review): this span is a rendered diff hunk, so removed and added lines
# are interleaved. The statements that reference the LOGGER constant appear to
# be the old top-level script, and those that reference the `logger` argument
# the new method body -- confirm against the repository before editing.
#
# Wraps the SKOS-to-database load in a module method so that it can be called
# with explicit arguments (the integration spec calls it this way) instead of
# only running when the script is loaded.
module SkosToDbWrapper
# Drops and recreates the :names table in the database opened from +dbfile+,
# inserts one row per entry read from +skosfile+, adds indexes, and then
# writes see-also labels into the stored JSON of cross-referenced entries.
#
# @param skosfile [String] passed to AuthorityBrowse::LocSKOSRDF::Name::Skosfile.new
# @param dbfile [String] passed to AuthorityBrowse.db
# @param logger [Logger] receives progress messages; defaults to a Logger on $stderr
def self.run(skosfile, dbfile, logger=Logger.new($stderr))
db = AuthorityBrowse.db(dbfile)

db = AuthorityBrowse.db(dbfile)
# drop old names table
if db.tables.include? :names
logger.info "Dropping old names table"
db.drop_table(:names)
end

if db.tables.include? :names
LOGGER.info "Dropping old names table"
db.drop_table(:names)
end
# create the names table
logger.info "Creating table"
db.create_table(:names) do
String :id, primary_key: true
String :label
String :match_text
Boolean :xrefs
Boolean :deprecated
Integer :count, default: 0
String :json, text: true
end

LOGGER.info "Creating table"
db.create_table(:names) do
String :id, primary_key: true
String :label
String :match_text
Boolean :xrefs
Boolean :deprecated
Integer :count, default: 0
String :json, text: true
end
# Prepared insert statement. The count column is not bound here, so inserted
# rows take its declared default of 0.
sequel_table = db[:names]
ds = sequel_table.prepare(:insert, :insert_full_hash, id: :$id, label: :$label,
match_text: :$match_text, deprecated: :$deprecated,
xrefs: :$xrefs, json: :$json)

sequel_table = db[:names]
ds = sequel_table.prepare(:insert, :insert_full_hash, id: :$id, label: :$label,
match_text: :$match_text, deprecated: :$deprecated,
xrefs: :$xrefs, json: :$json)

milemarker = Milemarker.new(batch_size: 100_000, name: "Add skos data to database", logger: LOGGER)
milemarker.log "Starting skos parsing"
sequel_table.db.transaction do
AuthorityBrowse::LocSKOSRDF::Name::Skosfile.new(skosfile).each_with_index do |e, i|
ds.call e.db_object
milemarker.increment_and_log_batch_line
end
milemarker.log_final_line
end
# Every entry in the SKOS file is inserted inside one transaction; the
# milemarker is configured with a batch size of 100_000 for progress logging.
milemarker = Milemarker.new(batch_size: 100_000, name: "Add skos data to database", logger: logger)
milemarker.log "Starting skos parsing"
sequel_table.db.transaction do
# NOTE(review): the index `i` is never used in this block.
AuthorityBrowse::LocSKOSRDF::Name::Skosfile.new(skosfile).each_with_index do |e, i|
ds.call e.db_object
milemarker.increment_and_log_batch_line
end
milemarker.log_final_line
end

milemarker.log "Adding indexes"
db.alter_table(:names) do
add_index :deprecated
add_index :label
add_index :match_text
add_index :xrefs
add_index [:match_text, :deprecated]
add_index :count
end
# Indexes are added after the bulk insert rather than at table creation.
milemarker.log "Adding indexes"
db.alter_table(:names) do
add_index :deprecated
add_index :label
add_index :match_text
add_index :xrefs
add_index [:match_text, :deprecated]
add_index :count
end

# Buzz through all the items in the table that declare they have xrefs
# and add the labels for forward/backward see-alsos
# @param [Sequel::Dataset] sequel_table The table we're using
sequel_table = db[:names]

updater = sequel_table.where(id: :$id).prepare(:update, :json_update, json: :$json)

sequel_table.db.transaction do
milemarker = Milemarker.new(batch_size: 10_000, name: "xref resolution", logger: LOGGER)
milemarker.logger.info "Starting xref stuff"
sequel_table.where(xrefs: true).each do |rec|
e = AuthorityBrowse::LocSKOSRDF::Name::Entry.new_from_dumpline(rec[:json])
id = e.id
label = e.label
sequel_table.select(:id, :label, :json).where(id: e.xref_ids).each do |target_db_record|
target = AuthorityBrowse::LocSKOSRDF::Name::Entry.new_from_dumpline(target_db_record[:json])
e.add_see_also(target.id, target.label)
target.add_incoming_see_also(id, label)
updater.call(id: target.id, json: target.to_json)
rescue => err
require "pry"
binding.pry
# Buzz through all the items in the table that declare they have xrefs
# and add the labels for forward/backward see-alsos
# @param [Sequel::Dataset] sequel_table The table we're using
sequel_table = db[:names]

# Prepared update that rewrites the json column of the row whose id is bound
# at call time.
updater = sequel_table.where(id: :$id).prepare(:update, :json_update, json: :$json)

sequel_table.db.transaction do
milemarker = Milemarker.new(batch_size: 10_000, name: "xref resolution", logger: logger)
milemarker.logger.info "Starting xref stuff"
# Only rows flagged xrefs: true are visited; each is rebuilt from its stored
# JSON before its cross references are looked up.
sequel_table.where(xrefs: true).each do |rec|
e = AuthorityBrowse::LocSKOSRDF::Name::Entry.new_from_dumpline(rec[:json])
id = e.id
label = e.label
# For every row whose id is in e.xref_ids: record the target on the source
# entry, record the source on the target entry, and save the target's JSON.
sequel_table.select(:id, :label, :json).where(id: e.xref_ids).each do |target_db_record|
target = AuthorityBrowse::LocSKOSRDF::Name::Entry.new_from_dumpline(target_db_record[:json])
e.add_see_also(target.id, target.label)
target.add_incoming_see_also(id, label)
updater.call(id: target.id, json: target.to_json)
# NOTE(review): any StandardError raised in this inner block opens an
# interactive pry session, and `err` is otherwise unused. This looks like
# leftover debugging -- confirm before running unattended.
rescue => err
require "pry"
binding.pry
end
# Save the source entry once all of its see-also labels have been added.
updater.call(id: e.id, json: e.to_json)
milemarker.increment_and_log_batch_line
end
milemarker.log_final_line
end
updater.call(id: e.id, json: e.to_json)
milemarker.increment_and_log_batch_line

# Release the database connection before returning.
db.disconnect

end
milemarker.log_final_line
end

# :nocov:
# Command-line entry point: the first argument is the SKOS file and the second
# is the database file. The arguments are only consumed when the load actually
# runs, so requiring this file with APP_ENV set to "test" leaves ARGV untouched.
unless ENV["APP_ENV"] == "test"
  skosfile = ARGV.shift
  dbfile = ARGV.shift
  SkosToDbWrapper.run(skosfile, dbfile)
end
# :nocov:

Binary file added spec/fixtures/twain_skos.json.gz
Binary file not shown.
16 changes: 16 additions & 0 deletions spec/integrations/skos_to_db_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
require "fileutils"
require_relative "../../bin/names/skos_to_db.rb"

# Integration spec: runs the whole SKOS load against a small fixture file and a
# throwaway database under tmp/.
RSpec.describe SkosToDbWrapper do
  before(:each) do
    @skos_file = "spec/fixtures/twain_skos.json.gz"
    @db_file = "tmp/database.db"
    # Stand-in logger so the run does not write progress lines to stderr.
    @logger = instance_double(Logger, info: nil)
    @db = AuthorityBrowse.db(@db_file)
  end

  it "runs something" do
    # Pass the double explicitly; otherwise run falls back to a $stderr Logger.
    described_class.run(@skos_file, @db_file, @logger)
  end

  after(:each) do
    # Close the connection before removing the file it points at.
    @db.disconnect
    FileUtils.rm_f(Dir.glob("tmp/*"))
  end
end

0 comments on commit 0cd9cf4

Please sign in to comment.