From 3e760e2ac4a35893a07cbeac83a7998925708dcc Mon Sep 17 00:00:00 2001 From: Monique Rio Date: Mon, 18 Sep 2023 17:03:40 -0400 Subject: [PATCH] adds specs for solr_docs_from_terms* --- .../solr_docs_from_terms_and_dump_files.rb | 90 ++++++++++-------- spec/fixtures/civil_war_dumpfile.jsonl.gz | Bin 0 -> 979 bytes spec/fixtures/civil_war_terms.tsv.gz | Bin 0 -> 5133 bytes ...olr_docs_from_terms_and_dump_files_spec.rb | 16 ++++ 4 files changed, 66 insertions(+), 40 deletions(-) create mode 100644 spec/fixtures/civil_war_dumpfile.jsonl.gz create mode 100644 spec/fixtures/civil_war_terms.tsv.gz create mode 100644 spec/integrations/solr_docs_from_terms_and_dump_files_spec.rb diff --git a/bin/subjects/solr_docs_from_terms_and_dump_files.rb b/bin/subjects/solr_docs_from_terms_and_dump_files.rb index 0122c425..8f7a7b70 100644 --- a/bin/subjects/solr_docs_from_terms_and_dump_files.rb +++ b/bin/subjects/solr_docs_from_terms_and_dump_files.rb @@ -3,55 +3,65 @@ require "time" require "authority_browse" -dumpfile = ARGV.shift -termsfile = ARGV.shift -outfile = ARGV.shift +module SubjectToSolrDocsWrapper + def self.run(dumpfile, termsfile, outfile) + warn "Loading the dumpfile. 500k entries, each dot is 100k" + s = Time.now + subjects = AuthorityBrowse::LocSKOSRDF::Subject::Subjects.load(dumpfile) -$stderr.sync = true + t = Time.now + warn "\nDumpfile loaded in #{(t - s) / 60} minutes" -unless dumpfile && termsfile && outfile - warn "\n\nUsage:" - warn " #{$0} " - warn "\n\n where:" - warn " _dumpfile_ is produced by the skos_to_dumpfile script" - warn " _termsfiles_ is a tab-delimited set of term-count pairs" - warn " _outfile_ is where you want the resulting solr docs to be" - warn "" - warn "The whole process balloons up to about 8GB, so allocate accordingly" - warn "\n\n" - exit 1 -end + warn "Load terms-with-counts file. 5.5M-ish terms, each dot is 100k." + subjects.load_terms(termsfile) -unless Pathname.new(dumpfile).exist? - warn "Dumpfile '#{dumpfile}' can't be found" -end + x = Time.now + warn "\nTerms file loaded in #{(x - t) / 60} minutes" -unless Pathname.new(termsfile).exist? - warn "Terms file '#{termsfile}' can't be found" -end + warn "Determine counts for the cross-references" + subjects.add_xref_counts! -warn "Loading the dumpfile. 500k entries, each dot is 100k" -s = Time.now -subjects = AuthorityBrowse::LocSKOSRDF::Subject::Subjects.load(dumpfile) + d = Time.now + warn "Cross-refs set up in #{d - x} seconds" -t = Time.now -warn "\nDumpfile loaded in #{(t - s) / 60} minutes" + warn "Dump solr docs to '#{outfile}'" + Zinzout.zout(outfile) do |out| + subjects.each { |s| out.puts s.to_solr_doc.to_json } + end + o = Time.now + warn "Solr documents dumped in #{(o - d) / 60} minutes" + end +end + +dumpfile = ARGV.shift +termsfile = ARGV.shift +outfile = ARGV.shift -warn "Load terms-with-counts file. 5.5M-ish terms, each dot is 100k." -subjects.load_terms(termsfile) +$stderr.sync = true -x = Time.now -warn "\nTerms file loaded in #{(x - t) / 60} minutes" +# :nocov: +if ENV["APP_ENV"] != "test" + unless dumpfile && termsfile && outfile + warn "\n\nUsage:" + warn " #{$0} " + warn "\n\n where:" + warn " _dumpfile_ is produced by the skos_to_dumpfile script" + warn " _termsfiles_ is a tab-delimited set of term-count pairs" + warn " _outfile_ is where you want the resulting solr docs to be" + warn "" + warn "The whole process balloons up to about 8GB, so allocate accordingly" + warn "\n\n" + exit 1 + end -warn "Determine counts for the cross-references" -subjects.add_xref_counts! + unless Pathname.new(dumpfile).exist? + warn "Dumpfile '#{dumpfile}' can't be found" + end -d = Time.now -warn "Cross-refs set up in #{d - x} seconds" + unless Pathname.new(termsfile).exist? + warn "Terms file '#{termsfile}' can't be found" + end -warn "Dump solr docs to '#{outfile}'" -Zinzout.zout(outfile) do |out| - subjects.each { |s| out.puts s.to_solr_doc.to_json } + SubjectToSolrDocsWrapper.run(dumpfile, termsfile, outfile) end -o = Time.now -warn "Solr documents dumped in #{(o - d) / 60} minutes" +# :nocov: diff --git a/spec/fixtures/civil_war_dumpfile.jsonl.gz b/spec/fixtures/civil_war_dumpfile.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..f72bb91f9acba68d65c9aadd3115c63d1cb119a2 GIT binary patch literal 979 zcmV;^11$U>iwFqju?S@V17m4+X>4D2VRBz&b!~8FX>4UKYIARHYyjn0T~FIE6n)>X zu=2D?O>s!mkGuif*tAVUte#-!f|8kMS|a9>%Qz2Tsk2NS{T3 zKI2m!p*x~n_d-bnpB zCE&@;P^wrv;gyrO<~2mmS^Qx@(x`;hoW7D|z?2g{6erydyaxPu7F15I1#&6Y52Dc9 z&tlF3-kC%wB6sgBUzrFH2XANvDj~iZ58=03iW?dc4I(%v>H7DlO5SM}hUZK$#p%y3 z8#<4V3*I+;Fw?9&gX`y3{3>bh%lBUQ{l_ru%@c&-2Vr0XAK#P}4(iOuIPm>OcAa%u z#7A5v8WBCQm>m_SGGeo9YpYuJ%QyhioldhLW7j=R27@uzaSdIajTP4h${t>?hM1xC z>oSVCZa>Y1=t?G==s?BtKrh_!GR1;+L9v_}!h}q+dgc0ctOW6+)S$-RIYXCI`R;^B~_!SnY!+09g(=43_OJ# z9)Bpp<7VU7%VM9)vBUtfNM`#Aj&?;{Z6G2|(`B$`1nrkIk_e)Jb+zVdws6b0X)9V$ z(8PYTk`>L7OGVG+6m*H1Fg4U=8ajdq%8-%yrZieWklH|$aHm>H!5Utx65%;=NqB39 z%P0E6@vRg?&+`LZiD8OdGVHRcj1n`e++1#IE$G!)!4T?IIx|z`+5oafbrme*~+=k~(UR@+P`YQzNHI+gkAlcHYzl+GYx457bt0fVl1XT%LZ@GF(<9J$os zLPiiq!8@qpF@r-_4B)OiWJ2#6`X$d17m4+X>4D2VRBz|WpZtEE_8Et0L48`ZzDN&=k8y@+qwal zVM&&49oz19+tV$#(drM(202hww8U|#s3EJ|qq*gjQ;^?~T=ulL*tW-k#$7ARiy!7a!HHLp;YT?0MHz#Efq|?~*>!5Piq|5Dm^x2Oj(} zK0^O+&)EvGFhD+zamW_`_+t3-bx3}}QNSrimn@0m4LbOWk6u-1yckq~c3~p%7sQS{ znGC(*@$t~}KG2wO94DM0V29!*DJI}XV9H|3BDAKyBqDSwt0mzgMm`HevLHLIB_(o* zZjY{yb|eu6?2&MEFvUVhSvL|<2n!ZkSARCnR(}P{hUGUS6z~!(StKIXcfM^l7dVXR zioqrwOan8*i_yyqPJ>0vqC+$#JIV=2ES5k59b7Pxl5!N-1!wFb+yLe+`Hp^M{6X&5 zkCw>-d~$yeTmifR=n{)KBx)YFN9e~L7j=o_B@3yKE|&@Sm!9_}0<90wEXFYrZA%-z zyjbDG`EV8_HyI;>kHKX=GKmld%VXOdP>H?}D-g z$Lr+=l)?=|eWf@;oP?Czd)|zJsEBZ8Ssny8_&Ye!EW#m*FbDfXAAy(aWGizkwSO=v z{|i-SQ+%Tl3JYWRVt8yD0wel z_{DIr{qYdJg;PLyZ8cki6`>7}<0bM*1pGJE1(Z0qjK@neIW<%zHjvgkCFXf=BY)$E zY)t}+g_*!CnR_$5#2l75X#oIHV?mIJlYrKRZ38cNrj2>2lI>*8q8smq*FMW{M8q#n zZ6ludhC^&&nzkk{!vrPs)j9!ztHd*Orv|`b#{gQ2-|XjzfBcfFMg8q7QOwxzR!a7{_x6#~}LPwC4P(CtXKbNC`)p6je~- zXI@krqVuruLA6;zSRtLFj1sR2|CKDH;B>+~i3rv35)QjK-uh%eZP{=x1f5zE1$2v^ z{8;m4<>)0|%~?R(_6ooVL_$`hVga7v2t#!7iOL}LD~5gCI8vzE&-40FvL%yCpuxfE zriUj&pU66ao$}qFlyQL_pF^YvY9>H~*6{x%w&AD8tbTU1L5r5!pfp+3d?yGMSuSGpY1IKyBXvXL3MX>irlHa!-y)tmzE#PnGTKMb%y2XDwau0@M>LsKFf z?0-4<_Ln7{Q@K8UO$CTqe~aOKMW>C!+Eh>xlRP+cFC!-W^Plfq8}j$P}z# zzx~V43FU5v8n6KEtXl@_lxKsNb+W;Ba+YeX{o z;YN%bkSB!?3xnP?puMMa9MTlK1KOQj5jY_*c32h#WY_~SZ0VM^X=O6d)KJ7aC}%== zeiFirK)KBF{_Q_vf`UKXJTeNIz~Ib2=x9#fK(2|_oIz3|XtdA?M4YPX!f_?O14~$+#9Pe6O?uW)%pg%cK7LX27HMojV0J#FFaZg* ziEPInfYr{MVW~A(Y4qy`*8tRD2ioSMX(p zW*ESb45q2BIU$iT7t+Lq|un0Aez$G8oa6YQs*BnLWt1iS@R#L41`u< zNqMz=J24A# zu`eRWcqex-usE9UbUyFjuRj}sf-X%CDD;#Q;rTA>gjVj{~M&X)D&<#dY zxo{AFF01A|!l-y=dwOC9IX^x2)D=Vh0>j9rNq5a_JZlAMN!uJMNd-u2+*d$;0{;lY zO+@K??FH^@#?=|jg3p>9Z=!%jMCe|p_zaWs$J~?f1!n|FUg!cDa#$GRRTG18414a{2+UW$io7OvZ+h zj{-kr(UivIhvbn&6ny%9ehO>W=RG(wbNS-~L@ANEo>UvML`!CvrX%u!%yA@7^aY_j ziH)VA4|Fao%yo@{j~keLZT49ZJlOuG3V-NBLIt{GF%!T2S%acM0OFD%OMOL^0jKqt z;hiscrc7qjvS>>nNb_xR-IV=`S9AJvLTukl16*6qEzks~l9+eDGU{;2bxT#@-*RoH zkJ_V)4APie)c}jcr2Z9RHqbXXSBxW3gVjpI8OvJ7-u1#UnwWlY+9d7La{u3!uS%3A- zu_$kNZ@<^sFD*y9u*pcH*ZXbM=WuPd~Po*6T;$=PT zYJiqG&nyY_N{=0bQ%G5&2Uo2s;63jvjw0%)`kkgEVP##I-zU5a{TxTefYLpwt%CFB zGw{pl#&w&55~s}4vf92Q@`gJf(k~0p1$}qL#$ST6EbOs@A+En%LiP=rE+jXa!8HL} z{D}e795jqtuvD#6qYYk3!-Fq5gW|$_3&-M)4TrK702BmvAiXEfvrdPg-AL&@mc>Wx z0GWP_&=uzO7}5cE1{rdU?g*%}m?wPUfbvDUm?FFGut3i{Pz`nP$Ei`4rE5Mh4VqW= zT`72N-)AHuDgXc<3h`>B7dmE?KyZgX5Xj@|C6?+)(}HHqr`5C5Fxb>awFRl&%{PafO3k;Z8mGBzEgTB>&iw1k&BMtY@TA{x^BwAZ?THAag%w4VUi zl`tgHnv64Ns}Yki&+GxAFR@K#aAW}cA%`Q?4Wu9FTh?qayb{fv@CheOi*k4K& zKbR57fdb4oud2-^HLP@Nwm#WCqZVM8m$2>l)hDf}*9MS&4x|~WMe~0oD`NU& zy(G~`n`+J5unu8F6H<-SxEbqz|LfoJ-~Kzms>?dW{6XAP5(bCpim}(qs&2l|1(d;#Bz%sGtjn z@r}GZddfSVw~vr>Jjy;-bZ`^dp27@XUeQPz0_34=82(meGZ~yqs1V00lTDwh>^7w< zFbn7krX=6zsCO#UHcApJ7o3WTUEcv^mxL{V)zjvRQQqje3ewB@6y1hg87^$>WjH4>ZBtR6OFDDt?X7OygR&FFGCW`K)54jUhwUP zX2_CF>mPO`TSjAD@GQr%dWC(UTy~!8IGnF*%sEvj@(|tO9Ss85CCHU`e7Bh)uUSY% zvx(cjo75kc9m_me+PNF{eb?!5QVXaw7DL#I$#7IATi{a!ZhKOfCO0I8Xx+$m{PMkY zv;M=zvh=3mIvOgJy@;wvmk;&Mf&n@o-J+-6q z4{1w_OUl}woXKZ$XQyBjvI|Pv=ABPeoy5@F38Yy$WXqP;1Jmn;B8^mYw?kga7MIeP)S6Fmya=>i+= zvMSz!76Ir+fr27H_KLn?UdCObIiw?nWwZlOv{>Xu~=pyy3=FnY5*kPj?3p? z;yZ-K@@@DJE@836`p829&IcWQaEPRbol!~`mp^ZBc)7kkyA(mGcsr*o96@sVD8p5t8nJ$Ri?pX*)<=0(VIXqS)_fkevGUs_Bj4PhMg zeTfbrWwD$WaOVzCMzX5AWY<*xBZLsJ`ltPXheOfB4XVxej$DXXFz+&l1d5yaD7$N_ zPyCLp?m3B{Bxl)DqAC^#Q(_JIC6HLj8vE|C(dgwT91^-vBrbQH?I#EXAYBVbERSRmKCkZKeX^_A5?F6u985E$K2u`J+>a@<$U>@2(Z?u*7?D;skamfBFLR7&gNN z-1Exk$!_o$mFMJc7qe0q?smWAOBw!3g&}G3(teP;Ay_0~vrW}F z+YWOJmN7&{w~^QeZdt^*6MWvkShQz%=2&@c?k12HY8RD)^NyGn$J>!w$ovN!knZxh z#49+_iyb6cV5(C-$8j9)#9x|rqY%!zk_s_h$;xmyG?lCEWf!bRrWG%eT|tKsIIlpl z)4x^jBh(G9C2#M@$>_KCviNHz)({DYFk63#r5_~4)$^VxnU8GO{k~3Cs)eT;)|3RP z!3JFyvg=URrSSbdrrcJT-i?8eReMu6468-Er5gHa?%o zVzkb7qV+9C*Bv(yG@r!Nu7uNQu=93hc~fOg8R~wtL6J9~;m)Gt6hx;Q-nwg+!Ufr@ zv%FUSwI2s08+o#)>`MhHTg)bVAmVwd|A+B4{B;zd05FBkaCeo5nc?g7S8E|vyT_4I zi?R9{z`fAsE0wdVj*ny|t46RDeJ-ZLP`;g8VVUnslV?-NzStVM?MFqb+=|?w$JK3{ ziTx1<&4%;}vb$lQYLPMm4F^VplX%%9tE(go3IBsoJzZ$d=>nql9{Jtid||_0MR*g2 zYO6qiveW+pvH${s(X?-tK7Kq>RK-CC z&?*F$!vLa)_r^i&r!Lh*K6R`E+15Ny&2{fArADLi$=(9UlB)7MmQwM)Imu4&XAxE3 zrn$=Lq*r@ZkWsFA8#c0xR%wOfyuaPS`jh>6qxL<2p zGCJ#38$KPOn<8tj7CA5eKbXGhm1)5k1D_co{UC38o-;+<+omQQH& znn)58YhO+~4bOWW1ZhL`p=@_wF*4a|# znMFv@!0D<({vw&$02kG(9Gh+04v(BPoFgRL6ike4ke`Up6Dt3~0+3b|0$zIS0+KVU zx!#B6KG%Kq`#=Bwm%sn3%RkFpHRAsV>}dB3@_hgRpSSE5 literal 0 HcmV?d00001 diff --git a/spec/integrations/solr_docs_from_terms_and_dump_files_spec.rb b/spec/integrations/solr_docs_from_terms_and_dump_files_spec.rb new file mode 100644 index 00000000..3829e6bd --- /dev/null +++ b/spec/integrations/solr_docs_from_terms_and_dump_files_spec.rb @@ -0,0 +1,16 @@ +require_relative "../../bin/subjects/solr_docs_from_terms_and_dump_files.rb" +RSpec.describe SubjectToSolrDocsWrapper do + before(:each) do + @dumpfile = "spec/fixtures/civil_war_dumpfile.jsonl.gz" + @termsfile = "spec/fixtures/civil_war_terms.tsv.gz" + @outfile = "tmp/outfile.json" + end + it "runs something" do + expect(File.exist?(@outfile)).to eq(false) + described_class.run(@dumpfile, @termsfile, @outfile) + expect(File.exist?(@outfile)).to eq(true) + end + after(:each) do + `rm tmp/*` + end +end