Skip to content

Commit

Permalink
feat: can generate remediated authorities file
Browse files Browse the repository at this point in the history
Adds a method for generating the conf/remediated_subjects.xml file. It
pulls the records from Alma and writes them to that conf file. In the
browse cli, the command `subjects generate_remediated_authorities_file`
is added; it pulls the records and updates that configuration file.

A github action is added to call the `browse subjects
generate_remediated_authorities_file` cli command and generate a PR if
the content of the file has changed.
  • Loading branch information
niquerio committed Jul 26, 2024
1 parent e175dd9 commit 67d6ac4
Show file tree
Hide file tree
Showing 14 changed files with 196 additions and 5 deletions.
39 changes: 39 additions & 0 deletions .github/workflows/update-sh-config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Regenerates conf/remediated_subjects.xml from Alma and opens a pull request
# with the result. Runs on demand (workflow_dispatch) and on a monthly schedule.
name: Update subject headings config file

on:
workflow_dispatch:
schedule:
- cron: '0 8 1 * *' # 08:00 on the first day of every month (GitHub Actions schedules are evaluated in UTC)


jobs:
update_subject_headings:
runs-on: ubuntu-latest
outputs:
# Head commit of the pull request created by the "cpr" step below.
sha: ${{ steps.cpr.outputs.pull-request-head-sha }}
steps:
- uses: actions/checkout@v4
# Concatenate every env.* file in the repository root (e.g. env.example) into .env.
- name: Create .env file
run: cat env.* > .env
# NOTE(review): presumably exports the values in .env to the following steps - confirm against the action's docs.
- name: Load .env file
uses: xom9ikk/dotenv@v2
- name: Set up Ruby 3.3
uses: ruby/setup-ruby@v1
with:
ruby-version: '3.3'
bundler-cache: true
# Put the repository's exe/ directory on PATH so the `browse` command can be called by name.
- name: set path
run: |
echo "$GITHUB_WORKSPACE/exe" >> $GITHUB_PATH
# Rewrite the remediated subjects file using the Alma API key (secret) and set id (repository variable).
- name: get update
env:
ALMA_API_KEY: ${{ secrets.ALMA_API_KEY }}
SUBJECT_HEADING_REMEDIATION_SET_ID: ${{ vars.SUBJECT_HEADING_REMEDIATION_SET_ID }}
run: browse subjects generate_remediated_authorities_file
# Open a pull request with the regenerated file for review.
- name: Create Pull Request
id: cpr
uses: peter-evans/create-pull-request@v6
with:
commit-message: "update remediated subject headings config file"
title: Update remediated subject headings config file
reviewers: niquerio
2 changes: 2 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@ gem "canister"
gem "rubyzip"
gem "semantic_logger"
gem "thor"
gem "marc"
gem "solr_cloud-connection", ">= 0.4.0"
gem "alma_rest_client", github: "mlibrary/alma_rest_client", tag: "v2.0.0"

gem "sqlite3", "~> 1.4", platforms: :mri
gem "jdbc-sqlite3", "~> 3.28", platforms: :jruby
Expand Down
42 changes: 42 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
@@ -1,6 +1,28 @@
GIT
remote: https://github.com/mlibrary/alma_rest_client.git
revision: 9606225d82480b6d1568902813ae9018dd8c1acc
tag: v2.0.0
specs:
alma_rest_client (2.0.0)
activesupport (~> 7.0, >= 4.2)
faraday
faraday-retry
httpx
rexml

GEM
remote: https://rubygems.org/
specs:
activesupport (7.1.3.4)
base64
bigdecimal
concurrent-ruby (~> 1.0, >= 1.0.2)
connection_pool (>= 2.2.5)
drb
i18n (>= 1.6, < 2)
minitest (>= 5.1)
mutex_m
tzinfo (~> 2.0)
addressable (2.8.6)
public_suffix (>= 2.0.2, < 6.0)
ast (2.4.2)
Expand All @@ -10,30 +32,42 @@ GEM
canister (0.9.2)
coderay (1.1.3)
concurrent-ruby (1.2.2)
connection_pool (2.4.1)
crack (0.4.5)
rexml
diff-lcs (1.5.0)
docile (1.4.0)
dotenv (2.8.1)
drb (2.2.1)
faraday (2.7.12)
base64
faraday-net_http (>= 2.0, < 3.1)
ruby2_keywords (>= 0.0.4)
faraday-follow_redirects (0.3.0)
faraday (>= 1, < 3)
faraday-net_http (3.0.2)
faraday-retry (2.2.1)
faraday (~> 2.0)
ffi (1.16.3)
ffi-icu (0.5.2)
ffi (~> 1.0, >= 1.0.9)
hashdiff (1.0.1)
http-2-next (1.0.1)
httpx (1.1.5)
http-2-next (>= 1.0.1)
i18n (1.14.5)
concurrent-ruby (~> 1.0)
json (2.7.1)
language_server-protocol (3.17.0.3)
lint_roller (1.1.0)
marc (1.2.0)
rexml
scrub_rb (>= 1.0.1, < 2)
unf
method_source (1.0.0)
milemarker (1.0.0)
minitest (5.24.1)
mutex_m (0.2.0)
mysql2 (0.5.5)
parallel (1.23.0)
parser (3.2.2.4)
Expand Down Expand Up @@ -80,6 +114,7 @@ GEM
ruby-progressbar (1.13.0)
ruby2_keywords (0.0.5)
rubyzip (2.3.2)
scrub_rb (1.0.1)
semantic_logger (4.15.0)
concurrent-ruby (~> 1.0)
sequel (5.75.0)
Expand Down Expand Up @@ -110,6 +145,11 @@ GEM
standardrb (1.0.1)
standard
thor (1.3.0)
tzinfo (2.0.6)
concurrent-ruby (~> 1.0)
unf (0.1.4)
unf_ext
unf_ext (0.0.9.1)
unicode-display_width (2.5.0)
webmock (3.19.1)
addressable (>= 2.8.0)
Expand All @@ -121,6 +161,7 @@ PLATFORMS
x86_64-linux

DEPENDENCIES
alma_rest_client!
byebug
canister
concurrent-ruby (~> 1.1)
Expand All @@ -130,6 +171,7 @@ DEPENDENCIES
ffi-icu
httpx
jdbc-sqlite3 (~> 3.28)
marc
milemarker (~> 1.0)
mysql2
pry (~> 0.14)
Expand Down
12 changes: 12 additions & 0 deletions conf/remediated_subjects.xml

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions env.example
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
BIBLIO_SOLR="http://YOUR_SOLR_URL/solr/biblio"
ALMA_API_KEY="YOUR_API_KEY"
SUBJECT_HEADING_REMEDIATION_SET_ID="YOUR_SET_ID"
1 change: 1 addition & 0 deletions lib/authority_browse.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
require "byebug"
require "services"
require "concurrent"
require "alma_rest_client"

module AuthorityBrowse
end
Expand Down
17 changes: 17 additions & 0 deletions lib/authority_browse/subjects.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,23 @@ def kind
"subject"
end

# Downloads every authority record that belongs to an Alma set and writes
# them to a file, one record per `puts` call.
#
# All records are fetched before the destination is opened, so a failed
# request raises without truncating or partially rewriting an existing file.
#
# @param file_path [String] file to (over)write with the fetched records
# @param set_id [String] id of the Alma set whose members are fetched
# @raise [StandardError] when the set members or any individual authority
#   record cannot be retrieved (response status other than 200)
# @return [void]
def generate_remediated_authorities_file(file_path: S.remediated_subjects_file, set_id: S.subject_heading_remediation_set_id)
  # Use a ten-minute request timeout instead of the Faraday default.
  conn = Faraday.new do |faraday|
    faraday.options.timeout = 10 * 60
  end
  client = AlmaRestClient::Client.new(conn)

  # The response lists the set members under "member" (read just below), so
  # the same key is handed to get_all for combining paged results.
  # NOTE(review): confirm against AlmaRestClient::Client#get_all that
  # record_key names the array merged across pages.
  resp = client.get_all(url: "conf/sets/#{set_id}/members", record_key: "member")
  raise StandardError, "Couldn't retrieve authority set data for #{set_id}; #{resp.body}" if resp.status != 200

  records = resp.body["member"].map do |member|
    id = member["id"]
    record_resp = client.get("bibs/authorities/#{id}", query: {view: "full"})
    raise StandardError, "Couldn't retrieve authority data for #{id}" if record_resp.status != 200
    # The first entry of "anies" is the record body that gets written out.
    record_resp.body["anies"].first
  end

  # Only now is the destination opened for writing (and truncated).
  File.open(file_path, "w") do |file|
    records.each { |record| file.puts(record) }
  end
end

# Loads the subjects and subjects_xrefs tables with data from LOC
#
# @param loc_file_getter [Proc] when called needs to put a file with skos
Expand Down
8 changes: 8 additions & 0 deletions lib/browse.rb
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,14 @@ def load_solr_with_matched
def load_solr_with_unmatched
AuthorityBrowse::Subjects.load_solr_with_unmatched
end
# CLI command `generate_remediated_authorities_file` of the subjects subcommand.
# The desc/long_desc calls supply the help text for the method defined right after them.
desc "generate_remediated_authorities_file", "generates a new file with the remediation rules for authority records"
long_desc <<~DESC
Gets and writes the authority records from Alma that have the rules for
updating subject headings. The file is written to #{S.remediated_subjects_file}.
DESC
# Delegates to AuthorityBrowse::Subjects, relying on its default file path and set id.
def generate_remediated_authorities_file
AuthorityBrowse::Subjects.generate_remediated_authorities_file
end
end

desc "solr SUBCOMMAND", "commands related to working with SolrCloud"
Expand Down
12 changes: 8 additions & 4 deletions lib/services.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,14 @@
# Sick and tired of writing "Services" all the time
S = Services

# Absolute path of the repository root, i.e. the parent of this file's directory.
S.register(:project_root) { File.absolute_path("..", __dir__) }

# Id of the set used for subject heading remediation, read from the
# environment; nil when the variable is not set.
S.register(:subject_heading_remediation_set_id) do
  ENV["SUBJECT_HEADING_REMEDIATION_SET_ID"]
end

# Location of the remediated subjects file: <project_root>/conf/remediated_subjects.xml
S.register(:remediated_subjects_file) do
  File.join(S.project_root, "conf", "remediated_subjects.xml")
end

# Add ENV variables from docker-compose
%w[DATABASE_ADAPTER MARIADB_ROOT_PASSWORD MARIADB_USER MARIADB_PASSWORD
DATABASE_HOST MARIADB_DATABASE].each do |e|
Expand Down Expand Up @@ -63,10 +71,6 @@
tag
end

S.register(:project_root) do
File.absolute_path(File.join(__dir__, ".."))
end

# Path to file for dumping generated solr docs before uploading to solr
S.register(:solr_docs_file) { "tmp/solr_docs.jsonl.gz" }

Expand Down
30 changes: 30 additions & 0 deletions spec/authority_browse/subjects_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,36 @@
it "has a .mutator_klass" do
expect(described_class.mutator_klass).to eq(AuthorityBrowse::DBMutator::Subjects)
end
# Exercises Subjects.generate_remediated_authorities_file against stubbed
# Alma requests: one for the set members and one for the single authority record.
context ".generate_remediated_authorities_file" do
let(:set_id) { "1234" }
let(:authority_record) { fixture("remediated_authority_record.json") }
# Same id as the only member listed in spec/fixtures/authority_set.json.
let(:authority_record_id) { "98187481368106381" }
let(:authority_set) { fixture("authority_set.json") }
# Stub for the set-members request.
# NOTE(review): limit/offset presumably mirror the paging parameters get_all sends for its first page - confirm.
let(:stub_set_request) {
stub_alma_get_request(
url: "conf/sets/#{set_id}/members",
query: {limit: 100, offset: 0},
output: authority_set
)
}
# Stub for the request that fetches the full authority record.
let(:stub_authority_request) {
stub_alma_get_request(
url: "bibs/authorities/#{authority_record_id}",
query: {view: "full"},
output: authority_record
)
}
it "fetches authority records from the alma api for a given set and generates a file with a list of marcxml authorities" do
# `let` is lazy: referencing the stubs here registers them before the method under test runs.
auth_stub = stub_authority_request
set_stub = stub_set_request
file_path = "#{S.project_root}/tmp/auth_file.xml"
described_class.generate_remediated_authorities_file(file_path: file_path, set_id: set_id)
expect(auth_stub).to have_been_requested
expect(set_stub).to have_been_requested
# The written file should contain exactly the first "anies" entry of the fixture record.
output_str = File.read(file_path).strip
expect(output_str).to eq(JSON.parse(authority_record)&.dig("anies")&.first)
end
end

context ".reset_db" do
it "fetches and loads a skos file into :subjects and :subjects_xrefs" do
Expand Down
2 changes: 1 addition & 1 deletion spec/browse_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
end
end
RSpec.describe Browse::CLI::Subjects do
subjects_methods = [:update, :reset_db, :load_solr_with_matched, :load_solr_with_unmatched]
subjects_methods = [:update, :reset_db, :load_solr_with_matched, :load_solr_with_unmatched, :generate_remediated_authorities_file]
before(:each) do
subjects_methods.each do |method|
# verify that these methods exist before mocking them
Expand Down
9 changes: 9 additions & 0 deletions spec/fixtures/authority_set.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"member": [
{
"id": "98187481368106381",
"description": "Undocumented immigrants"
}
],
"total_record_count": 1
}
23 changes: 23 additions & 0 deletions spec/fixtures/remediated_authority_record.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{
"mms_id": 98187481368106380,
"record_format": "marc21_authority",
"title": "Undocumented immigrants",
"created_by": "System",
"created_date": "2021-11-17Z",
"last_modified_by": "rednaal",
"last_modified_date": "2023-03-29Z",
"originating_system": "LIBRARY_OF_CONGRESS",
"originating_system_id": "98184898010106381",
"cataloging_level": {
"value": "00",
"desc": "Default Level"
},
"vocabulary": {
"value": "MIUSH",
"desc": "miush"
},
"anies": [
"<?xml version=\"1.0\" encoding=\"UTF-16\"?><record><leader>01200cz a2200301n 4500</leader><controlfield tag=\"005\">20230329130030.0</controlfield><controlfield tag=\"008\">030627i| anannbabn |a ana </controlfield><controlfield tag=\"001\">98187481368106381</controlfield><datafield ind1=\" \" ind2=\" \" tag=\"010\"><subfield code=\"a\">sh 85003553</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"035\"><subfield code=\"a\">(DLC)sh 85003553</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"035\"><subfield code=\"a\">(LIBRARY_OF_CONGRESS)98171057700000041</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"040\"><subfield code=\"a\">DLC</subfield><subfield code=\"c\">DLC</subfield><subfield code=\"d\">DLC</subfield><subfield code=\"d\">WaU</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"150\"><subfield code=\"a\">Undocumented immigrants</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"450\"><subfield code=\"a\">Undocumented foreign nationals</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"450\"><subfield code=\"a\">Illegal aliens</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"450\"><subfield code=\"a\">Aliens</subfield><subfield code=\"x\">Legal status, laws, etc.</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"450\"><subfield code=\"w\">nne</subfield><subfield code=\"a\">Aliens, Illegal</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"450\"><subfield code=\"a\">Illegal aliens</subfield><subfield code=\"x\">Legal status, laws, etc.</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"450\"><subfield code=\"a\">Illegal immigrants</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"450\"><subfield code=\"a\">Undocumented noncitizens</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"550\"><subfield code=\"w\">g</subfield><subfield code=\"a\">Aliens</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"550\"><subfield 
code=\"a\">Immigrant detention centers</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"550\"><subfield code=\"a\">Human smuggling</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"550\"><subfield code=\"a\">Noncitizens</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"550\"><subfield code=\"a\">Illegal immigration</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"670\"><subfield code=\"a\">Work cat.: 2007017970: Illegal immigration, 2007:</subfield><subfield code=\"b\">eCIP data sheet (Illegal immigrants)</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"690\"><subfield code=\"a\">sla-lab updated based on DEIA Catalog Working Group changes May 2021; \"Undocumented immigrants\" term borrowed from Sears ; \"undocumented foreign national\" term from Bill H.R. 3776 (116th Congress)</subfield></datafield><datafield ind1=\" \" ind2=\" \" tag=\"690\"><subfield code=\"a\">sla-lab updated to include 550s for LCSH headings as references March 2023</subfield></datafield></record>"
],
"link": "https://api-na.hosted.exlibrisgroup.com/almaws/v1/bibs/authorities/98187481368106381"
}
2 changes: 2 additions & 0 deletions spec/spec_helper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
require "byebug"
require "webmock/rspec"
require "httpx/adapters/webmock"
require "alma_rest_client"
require "simplecov"
require "sequel"
SimpleCov.start
Expand Down Expand Up @@ -43,6 +44,7 @@
AuthorityBrowse::DB::Names.recreate_all_tables!

RSpec.configure do |config|
include AlmaRestClient::Test::Helpers
# Enable flags like --only-failures and --next-failure
config.example_status_persistence_file_path = ".rspec_status"

Expand Down

0 comments on commit 67d6ac4

Please sign in to comment.