Skip to content

Commit

Permalink
adds specs for solr_docs_from_terms*
Browse files Browse the repository at this point in the history
  • Loading branch information
niquerio committed Sep 18, 2023
1 parent 0e92c92 commit 3e760e2
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 40 deletions.
90 changes: 50 additions & 40 deletions bin/subjects/solr_docs_from_terms_and_dump_files.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,55 +3,65 @@
require "time"
require "authority_browse"

dumpfile = ARGV.shift
termsfile = ARGV.shift
outfile = ARGV.shift
# Wraps the subject "dumpfile + terms -> solr docs" pipeline in a callable
# module method, so it can be invoked with explicit arguments instead of ARGV.
#
# NOTE(review): this listing appears to come from a rendered diff in which
# removed and added lines are interleaved without +/- markers. Several
# statements below (the usage check, the file-existence checks, and a second
# dumpfile load) also appear in the top-level script further down and look
# like removed-side lines. Confirm against the real file before relying on
# the exact statement order shown here.
module SubjectToSolrDocsWrapper
# Loads subjects from the dumpfile, loads term counts, adds cross-reference
# counts, and writes one JSON solr document per subject to the output file.
# Progress and timing messages are emitted with `warn`.
#
# @param dumpfile passed to Subjects.load (presumably a file path; verify)
# @param termsfile passed to subjects.load_terms (presumably a file path; verify)
# @param outfile passed to Zinzout.zout as the destination for the solr docs
def self.run(dumpfile, termsfile, outfile)
warn "Loading the dumpfile. 500k entries, each dot is 100k"
s = Time.now
subjects = AuthorityBrowse::LocSKOSRDF::Subject::Subjects.load(dumpfile)

$stderr.sync = true
t = Time.now
# (t - s) is in seconds; dividing by 60 reports minutes.
warn "\nDumpfile loaded in #{(t - s) / 60} minutes"

# Prints usage and exits with status 1 when any of the three arguments is missing.
unless dumpfile && termsfile && outfile
warn "\n\nUsage:"
warn " #{$0} <dumpfile> <termsfile> <outfile>"
warn "\n\n where:"
warn " _dumpfile_ is produced by the skos_to_dumpfile script"
warn " _termsfiles_ is a tab-delimited set of term-count pairs"
warn " _outfile_ is where you want the resulting solr docs to be"
warn ""
warn "The whole process balloons up to about 8GB, so allocate accordingly"
warn "\n\n"
exit 1
end
warn "Load terms-with-counts file. 5.5M-ish terms, each dot is 100k."
subjects.load_terms(termsfile)

# Only warns when the dumpfile is missing; execution continues.
unless Pathname.new(dumpfile).exist?
warn "Dumpfile '#{dumpfile}' can't be found"
end
x = Time.now
warn "\nTerms file loaded in #{(x - t) / 60} minutes"

# Only warns when the terms file is missing; execution continues.
unless Pathname.new(termsfile).exist?
warn "Terms file '#{termsfile}' can't be found"
end
warn "Determine counts for the cross-references"
subjects.add_xref_counts!

# NOTE(review): second load of the dumpfile; `subjects` is reassigned here,
# after load_terms and add_xref_counts! were called on the earlier object.
# Likely a removed-side diff line; confirm.
warn "Loading the dumpfile. 500k entries, each dot is 100k"
s = Time.now
subjects = AuthorityBrowse::LocSKOSRDF::Subject::Subjects.load(dumpfile)
d = Time.now
# This duration is reported in seconds (no division by 60).
warn "Cross-refs set up in #{d - x} seconds"

t = Time.now
warn "\nDumpfile loaded in #{(t - s) / 60} minutes"
warn "Dump solr docs to '#{outfile}'"
# Writes each subject's solr document as one JSON line to the output.
Zinzout.zout(outfile) do |out|
subjects.each { |s| out.puts s.to_solr_doc.to_json }
end
o = Time.now
warn "Solr documents dumped in #{(o - d) / 60} minutes"
end
end

# Command-line entry point: the three positional arguments are taken from ARGV
# in order (dumpfile, termsfile, outfile).
#
# NOTE(review): this listing appears to come from a rendered diff in which
# removed and added lines are interleaved without +/- markers. As shown, it
# references `subjects` and `t` before anything in this span defines them, and
# the `if` opened below has no closing `end` in the visible lines. Confirm
# against the real file.
dumpfile = ARGV.shift
termsfile = ARGV.shift
outfile = ARGV.shift

warn "Load terms-with-counts file. 5.5M-ish terms, each dot is 100k."
subjects.load_terms(termsfile)
$stderr.sync = true

x = Time.now
warn "\nTerms file loaded in #{(x - t) / 60} minutes"
# :nocov:
# The CLI section is skipped when APP_ENV is "test" (presumably so the spec
# can require this file without triggering the usage exit; verify).
if ENV["APP_ENV"] != "test"
# Prints usage and exits with status 1 when any of the three arguments is missing.
unless dumpfile && termsfile && outfile
warn "\n\nUsage:"
warn " #{$0} <dumpfile> <termsfile> <outfile>"
warn "\n\n where:"
warn " _dumpfile_ is produced by the skos_to_dumpfile script"
warn " _termsfiles_ is a tab-delimited set of term-count pairs"
warn " _outfile_ is where you want the resulting solr docs to be"
warn ""
warn "The whole process balloons up to about 8GB, so allocate accordingly"
warn "\n\n"
exit 1
end

warn "Determine counts for the cross-references"
subjects.add_xref_counts!
# Only warns when the dumpfile is missing; there is no exit here.
unless Pathname.new(dumpfile).exist?
warn "Dumpfile '#{dumpfile}' can't be found"
end

d = Time.now
warn "Cross-refs set up in #{d - x} seconds"
# Only warns when the terms file is missing; there is no exit here.
unless Pathname.new(termsfile).exist?
warn "Terms file '#{termsfile}' can't be found"
end

warn "Dump solr docs to '#{outfile}'"
Zinzout.zout(outfile) do |out|
subjects.each { |s| out.puts s.to_solr_doc.to_json }
# Delegates the whole pipeline to the wrapper module.
# NOTE(review): shown inside the Zinzout block only because of the diff
# interleaving; it is probably a direct child of the `if` above. Confirm.
SubjectToSolrDocsWrapper.run(dumpfile, termsfile, outfile)
end
o = Time.now
warn "Solr documents dumped in #{(o - d) / 60} minutes"
# :nocov:
Binary file added spec/fixtures/civil_war_dumpfile.jsonl.gz
Binary file not shown.
Binary file added spec/fixtures/civil_war_terms.tsv.gz
Binary file not shown.
16 changes: 16 additions & 0 deletions spec/integrations/solr_docs_from_terms_and_dump_files_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
require "fileutils"
require_relative "../../bin/subjects/solr_docs_from_terms_and_dump_files.rb"

# Integration spec for SubjectToSolrDocsWrapper.run: feeds it the small
# "civil war" fixture dumpfile and terms file and checks that an output file
# is produced.
#
# NOTE(review): requiring the script executes its top-level code. The script
# guards its CLI section on ENV["APP_ENV"] != "test", so this spec presumably
# relies on APP_ENV being set to "test" by the test setup; confirm.
RSpec.describe SubjectToSolrDocsWrapper do
  # All paths are relative to the directory the spec run is started from.
  let(:dumpfile) { "spec/fixtures/civil_war_dumpfile.jsonl.gz" }
  let(:termsfile) { "spec/fixtures/civil_war_terms.tsv.gz" }
  let(:outfile) { "tmp/outfile.json" }

  before(:each) do
    # Make sure the output directory exists and that no stale output from an
    # earlier (possibly aborted) run makes the "does not exist" check fail.
    FileUtils.mkdir_p(File.dirname(outfile))
    FileUtils.rm_f(outfile)
  end

  after(:each) do
    # Remove only the file this spec created, rather than shelling out to
    # `rm tmp/*`, which would delete unrelated files and is not portable.
    FileUtils.rm_f(outfile)
  end

  it "writes the solr docs output file" do
    expect(File.exist?(outfile)).to eq(false)
    described_class.run(dumpfile, termsfile, outfile)
    expect(File.exist?(outfile)).to eq(true)
  end
end

0 comments on commit 3e760e2

Please sign in to comment.