From 4e91997dec989262ce82f88d7c89ad6eac2ca02d Mon Sep 17 00:00:00 2001
From: kbartlett
Date: Thu, 3 Oct 2024 11:52:20 -0400
Subject: [PATCH 01/31] feat: connector for Neo4j

---
 datahub-web-react/src/images/neo4j.png        | Bin 0 -> 12968 bytes
 .../docs/sources/neo4j/neo4j.md               | 160 ++++
 .../docs/sources/neo4j/neo4j_recipe.yml       |  15 +
 .../recipes/neo4j_to_datahub.dhub.yaml        |  15 +
 metadata-ingestion/setup.py                   |   5 +
 .../ingestion/source/neo4j/neo4j_source.py    | 344 ++++++++++++++++++
 .../tests/unit/test_neo4j_source.py           | 157 ++++
 .../main/resources/boot/data_platforms.json   |  10 +
 8 files changed, 706 insertions(+)
 create mode 100644 datahub-web-react/src/images/neo4j.png
 create mode 100644 metadata-ingestion/docs/sources/neo4j/neo4j.md
 create mode 100644 metadata-ingestion/docs/sources/neo4j/neo4j_recipe.yml
 create mode 100644 metadata-ingestion/examples/recipes/neo4j_to_datahub.dhub.yaml
 create mode 100644 metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py
 create mode 100644 metadata-ingestion/tests/unit/test_neo4j_source.py

diff --git a/datahub-web-react/src/images/neo4j.png b/datahub-web-react/src/images/neo4j.png
new file mode 100644
index 0000000000000000000000000000000000000000..b03b2a4532b3ba0329fdaccbb60a3a57b111ed4f
GIT binary patch
literal 12968
(base85-encoded binary PNG data omitted)
z?Z>TPjMTI=8&lGHDt2~uGM@`5t>rAoNNx&QWdVm`YUEUgk3ob0l$1@$^rq$&gAUvc-oC`Ez6AEKeEavHq*9i*lvSzGhEJ-$YN z^ZbPP8IzH#NsqUB)d0TXmhyG@o2RFzvvWgyyi#>kbbXmRs0m;D09CsO6TYKJYw+amd^Qgt6(*~PNZ=Oi)^RPZOZ;q$> z{Ric-&wVi9$R$atlsDL~xm~>Hc&Do%S9h5HbP@jooExAtxmj72T1EdhGuva%d zood+AI~17908+BS#Q}Hj>@i5oDemxa{+A<8}0feXZi28DTYf z7C7*F8CS9aAkX*mbtV)Q@K}eGLX3@#g}`^Pu~cf9Aq@>2F^skhV0`XCug>0~?!Xp> zAES^oRxK?p8&fp;Vi*MUpCbOkJVT-|RB@g>v4omwuHK7i*sy_1=52XJ#FBljq+ ziU%zCW_x(^7slzC2DnnPiAM^avuhz2{a}Um?hV%NhoI6F8+&NHtse;LFl?D$c&5Jt zitc4y+Io0Afv>HzleloBNrk_F9{uCz2(U)AW@Y8d1Jdg07uC0dzO_$HNG-H2VytY# zu#^!6pt#xx!kAR*$P}8rO}bp^TWS2khiQw(?Rg^7_05M1sC&fKp$Ey*grWLpfD8O5 z1J)2eBAGkVH^IB0Hjx-SG_rudzia?3*4moEqpv;Iv*bzjzf1z^2gK8i!Y$&)eu zT0Ld$ynx;~v%{_e)K$;h({X#L5-45Ji(?;dC!Y(*pqXJ2j$t$HuV21=>+$pb@+ATs zvt9mQ{m#=q`nDZ~#|Vg;-ku?Hh%;b@B+|&Qk%oWdc`j;w>$oEg^!8McGR)mAKWIJ? z7JNQ4=I*fNUjLF1%;*3>fXn_kkd|prLt|rOMMXtGajaiqUtpGqDgf|x;wPWWGyI?xs~12FWo2idQVTp{4h;<%zfn0xxJ#)(LqlU@V=q)` z@8a?WpZEgOFN?<^hto1{!a}WQm3*U)B5CvnU|$8_7th=*8Xxn+Nq2VCRFdoJ>K+~* z0A9SJQFR*#gOF(!=Zf`m?*G1eIC|Q$+3k9$x@NFu1@E!c5H(hIs5Gz@UaWGxUBhSg zu%`2VmMKq+5sGykV%#V(EbPrdmBnt+w3^NGu@JLi`4q4N=dA&@u9+2UD4ieM9Y3J% z(F=GvNKI3^yvW%C9yL`}W);TT$suT}Xi8OkUaH`mK`f3dJcn^zQQW!HtF zFr}cDP3RrQ_Z#Gng_LUfoc9ZsFGL}8wMvXc8J48eYAx9>Rze z0C$dBum}%o2Gnh%#YF$B{;HgY#(dZP#xuZqvnDKbo|BH+Ps-mzG9yRLbbwL}M40Q| zXLJbg@p;jNnp`-bM!|QsYlOehtdx?_XkY}(7!MA5hkt_LhSTX0qr_!~?jB9%{lgzU zm!lzNKD+%`DR^z2;w)WXD=XmS;D-Mqqf|Fr=uZ;8ZaN6wza3bmIg`kXW5NZhx}V9t ziFll3-HMRK=yg7w72Bk}oYRSr2vEr;(j&|5PU@avxk%nI>$iWVq!br0`<7Q%=LndC z0R0-E!rGMRZHh#Lkf$dmCCS8oOi4`*^E9b|VFawzJ@8qZB_QRajs*T~O(|u_!7kN-Fw;(CwVspcE6R*md$J9{{OE$x(5* zw|d21=Uw+<=p@!Ht6AKfo)EQI#c*D6B$j)lp9kJ+|ZtEDXWj$*3#%7{6O~e>0V)ds)GIjpYalGc@0g2na7{dHrj9jRGrfK zrWe;{;`L|HE)x}V1-3ng&;QTxw2fpC)w&SN^gFTv!B2>x8Jw1`*C{fKra ze$Qy<7-gq9F9EumnsN}W zo}VPBcia|t#PXT4TruNN%-}EwC|vrRUK-G|@qgG3r_1pS^5>BCfHIZH;@)0mI+?IDMMBdu{UcJW` zHm^cVqmNow+riAWTokamg@idt;-5@oK;`x0oS!L>LVm2v(v`lx>I+Z|ZWo)xsGJ2l zGkpU6P)O`r!(MONorz_Idweu8KSO#5lk;dQ$4wh6l1=h^*hbLGXq|5W$W;%(1OPu9 z1p{M5feY}u0ZEg_YS`_0Nm-KQ?k%ObGzgf*ZyVrq<8kuVGp@USlX}2o+X0LrH(F!_ zkcdbRsgSqh9epu?1Awsw7+yWVcQt%}Dr-fU;Mgow^$kL`*MsziO*KA5?*TOB70`OB zYHHV4R~OvztCEO)wG02=G)UaTSR8K;9`d$l*O$|Et|ems;06Q)ifytmS|r_Uq=E6P zSD4@*HzNw?sD{%=sh6YW4E;=oHP+&~ z+rSwi5rkoz0E~TLiOL2r7fTG9_J3q$0Y?22Aoo5e3I%igZY!Xl5_x~AtnIe$#PvGI z9id~rJOJ7P!w;%v+ILr1Iz4``(X8ENjI7PJ%N&kdqFEDRt)OkgSUSJ&*3>IU8W+o=A22s z#b82ufa*tF+?(9@7di@xZNPx`pm}-tcWe;7zuHx!Ji>3T4VamC!?8qSG-Z0#j*~eB z^neOq1GwK@xmrA_P}ITiu!fNo%=KnW-Kc^p0F;*-%wN>HYD(fED0(6Kmb`~x8ugCTTHW*tE8F3~N|Z8z%wsZ&Vmb@PJ2P*f zm~9i)sf2lXJr`^AfqEcm*4n@rjG~*x7#X+1_U1Meh4=on*y9K~#vzM7q4)swl~~wVcQt3Whrx zCH*dGOf|}zkJ!S&hh(Uw+IlAD=y3A0Y)w^TDiJjxqM$ZjD=FOp&BqPi!D$c%R1(b# z#0pTVE-cu;cK&<#*u)Mqc{W8J4~S&_Vpaip9+T;vJ+u=oAhE8uPdu;c@@$vQmF-vAcR$-%+&AzG|C2yjr>y^pJ@Wa54TKCQvZ z*2y}q$LW!X=Bkw+SD%3y^)SPP-RboXgA0goK3JfLZYP z2tXd-GK63PAUHiejZVVv3YcvS49FtM;UOz1audVBk!2^Ptudd9DJUoae=eKfjiLBv zwpczRBLg@B&;exz`iT`4*sv7jIaGTAy@qB-%QCgBH(w_b794Lv4%c%E+Ci-zjj+YC^}r`+*<@vHGh7u# zL@db}`^MnntFAQsySS(=C)l&w9}cYWv=QDqM0wm5t$k2U2fzQnJv?}tT0D5+Cd~a( zRfCYWz)jj%mFocjAED&0;QjM+44u|W5@Jx@Zg0KPJt<#G zhMJB!+6DLh>59{fq5ZS`xxEU>suwsTK+L@9R>5MjROF@oa#bSLrr0PImZ8ZC2k6U1 z&ui)#P#Cs=D7MVuRg7W+tLW#Ox9akm{QZ6I)+N~A4XAj$U-Bn+b%N0&e2Bf8j=teG zwQ1Rrk)wtb#97= zYoW3B!xY$UAZMUl6r_NS2(6sycQ&g9_G5>82n~k|eSGE!=)a0u>&Gr@fDT?-5{{3F zCt9erG<#-~#jLQsllKBRK%+YupWDk?F^7B#eU$zxYHO5?cj3yA51$e~cGOo`wUZ@$M52XaKHadqq~_HV&_8Ktas^{lQ{)Kk{#Iyl(PniAgYK<_$N_ruh5|%%_5pRn>dX7f zyZiDsw`gSXGUT(Cl<7L9FHRVP6EkA<1pEoyhf(A30aIg;BDbnkolJr*@YyF)966VgbRDh@glYvgi 
z@wYb+rv2UuWVACF>4~BCN{3XtnmslSrKaySMSvrjps6Wxbus+|B%-YngH!0;dzw#6 z(Rr4hp8f!|YO%!uuNP?HhV!7go?9A!#x;0xgU%0`oQYy!L=-0(#_rrYIOaU9f%2K~ zVmr;>r&S+JN=jnCCdBp*EsrK%!w|d>1)A(m*=6qxY{LqP1pkg|9pqEcYMU`Oy%?gE)gb_=ET7Xy+ z8v+YLWE&>^$oUYDta1M%{Ffy0%t} +![Certified](https://img.shields.io/badge/support%20status-certified-brightgreen) +![Incubating](https://img.shields.io/badge/support%20status-incubating-blue) +![Testing](https://img.shields.io/badge/support%20status-testing-lightgrey) + +## Integration Details + + + +Neo4j metadata will be ingested into DataHub using Call apoc.meta.data(); The data that is returned will be parsed +and will be displayed as Nodes and Relationships in DataHub. Each object will be tagged with describing what kind of DataHub +object it is. The defaults are 'Node' and 'Relationship'. These tag values can be overwritten in the recipe. + + + +## Metadata Ingestion Quickstart + +### Prerequisites + +In order to ingest metadata from Neo4j, you will need: + +* Neo4j instance with APOC installed + + +### Install the Plugin(s) + +Run the following commands to install the relevant plugin(s): + +`pip install 'acryl-datahub[neo4j]'` + + +### Configure the Ingestion Recipe(s) + +Use the following recipe(s) to get started with ingestion. + +_For general pointers on writing and running a recipe, see our [main recipe guide](../README.md#recipes)._ + + + +
+ View All Recipe Configuration Options
+
+ | Field              | Required |    Default     | Description                                                                |
+ |--------------------|:--------:|:--------------:|----------------------------------------------------------------------------|
+ | source             |          |                |                                                                            |
+ | `type`             |    ✅    |    `neo4j`     | A required field with a default value                                      |
+ | config             |          |                |                                                                            |
+ | `uri`              |    ✅    |      None      | The URI for the Neo4j server                                               |
+ | `username`         |    ✅    |      None      | Neo4j Username                                                             |
+ | `password`         |    ✅    |      None      | Neo4j Password                                                             |
+ | `gms_server`       |    ✅    |      None      | Address for the gms server                                                 |
+ | `node_tag`         |    ❌    |     `Node`     | The tag that will be used to show that the Neo4j object is a Node          |
+ | `relationship_tag` |    ❌    | `Relationship` | The tag that will be used to show that the Neo4j object is a Relationship  |
+ | `environment`      |    ✅    |      None      | Environment used in the dataset URNs, e.g. `PROD`                          |
+ | sink               |          |                |                                                                            |
+ | `type`             |    ✅    |      None      | Sink type, e.g. `datahub-rest`                                             |
+ | config             |          |                |                                                                            |
+ | `server`           |    ✅    |      None      | Address of the DataHub server, e.g. `http://localhost:8080`                |
+
+ +#### `'acryl-datahub[neo4j]'` + +```yml +source: + type: 'neo4j' + config: + uri: 'neo4j+ssc://host:7687' + username: 'neo4j' + password: 'password' + gms_server: &gms_server 'http://localhost:8080' + node_tag: 'Node' + relationship_tag: 'Relationship' + environment: 'PROD' + +sink: + type: "datahub-rest" + config: + server: *gms_server +``` + + + +### Sample data that is returned from Neo4j. This is the data that is parsed and used to create Nodes, Relationships. +Details can be found here: https://neo4j.com/labs/apoc/4.4/overview/apoc.meta/apoc.meta.schema/ + + Example relationship: + { + relationship_name: { + count: 1, + properties: {}, + type: "relationship" + } + } + + Example node: + { + key: Neo4j_Node, + value: { + count: 10, + labels: [], + properties: { + node_id: { + unique: true, + indexed: true, + type: "STRING", + existence: false + }, + node_name: { + unique: false, + indexed: false, + type: "STRING", + existence: false + } + }, + type: "node", + relationships: { + RELATIONSHIP_1: { + count: 10, + direction: "in", + labels: ["Node_1", "Node_2", "Node_3"], + properties: { + relationsip_name: { + indexed: false, + type: "STRING", + existence: false, + array: false + }, + relationship_id: { + indexed: false, + type: "INTEGER", + existence: false, + array: false + } + } + }, + RELATIONSHIP_2: { + count: 10, + direction: "out", + labels: ["Node_4"], + properties: { + relationship_name: { + indexed: false, + type: "STRING", + existence: false, + array: false + }, + relationship_id: { + indexed: false, + type: "INTEGER", + existence: false, + array: false + } + } + } + } + } + } diff --git a/metadata-ingestion/docs/sources/neo4j/neo4j_recipe.yml b/metadata-ingestion/docs/sources/neo4j/neo4j_recipe.yml new file mode 100644 index 00000000000000..71202d4216df91 --- /dev/null +++ b/metadata-ingestion/docs/sources/neo4j/neo4j_recipe.yml @@ -0,0 +1,15 @@ +source: + type: 'neo4j' + config: + uri: 'neo4j+ssc://host:7687' + username: 'neo4j' + password: 'password' + gms_server: &gms_server 'http://localhost:8080' + node_tag: 'Node' + relationship_tag: 'Relationship' + environment: 'PROD' + +sink: + type: "datahub-rest" + config: + server: *gms_server \ No newline at end of file diff --git a/metadata-ingestion/examples/recipes/neo4j_to_datahub.dhub.yaml b/metadata-ingestion/examples/recipes/neo4j_to_datahub.dhub.yaml new file mode 100644 index 00000000000000..71202d4216df91 --- /dev/null +++ b/metadata-ingestion/examples/recipes/neo4j_to_datahub.dhub.yaml @@ -0,0 +1,15 @@ +source: + type: 'neo4j' + config: + uri: 'neo4j+ssc://host:7687' + username: 'neo4j' + password: 'password' + gms_server: &gms_server 'http://localhost:8080' + node_tag: 'Node' + relationship_tag: 'Relationship' + environment: 'PROD' + +sink: + type: "datahub-rest" + config: + server: *gms_server \ No newline at end of file diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 301e571c96accc..819c35879f87b6 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -320,6 +320,8 @@ "Authlib", } +neo4j = {"neo4j", "pandas"} + # Note: for all of these, framework_common will be added. plugins: Dict[str, Set[str]] = { # Sink plugins. @@ -487,6 +489,7 @@ "qlik-sense": sqlglot_lib | {"requests", "websocket-client"}, "sigma": sqlglot_lib | {"requests"}, "sac": sac, + "neo4j": neo4j } # This is mainly used to exclude plugins from the Docker image. 
@@ -628,6 +631,7 @@ "qlik-sense", "sigma", "sac", + "neo4j" ] if plugin for dependency in plugins[plugin] @@ -744,6 +748,7 @@ "qlik-sense = datahub.ingestion.source.qlik_sense.qlik_sense:QlikSenseSource", "sigma = datahub.ingestion.source.sigma.sigma:SigmaSource", "sac = datahub.ingestion.source.sac.sac:SACSource", + "neo4j = datahub.ingestion.source.neo4j.neo4j_source:Neo4jSource", ], "datahub.ingestion.transformer.plugins": [ "pattern_cleanup_ownership = datahub.ingestion.transformer.pattern_cleanup_ownership:PatternCleanUpOwnership", diff --git a/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py b/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py new file mode 100644 index 00000000000000..becea2c3bd7630 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py @@ -0,0 +1,344 @@ +import logging +import time +from dataclasses import dataclass +from typing import Iterable, Optional + +import pandas as pd +from neo4j import GraphDatabase +from pydantic.fields import Field + +from datahub.configuration.common import ConfigModel +from datahub.emitter.mce_builder import ( + make_data_platform_urn, + make_dataset_urn, + make_tag_urn, +) +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SupportStatus, + config_class, + platform_name, + support_status, +) +from datahub.ingestion.api.source import Source, SourceReport +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph +from datahub.metadata.schema_classes import ( + AuditStampClass, + BooleanTypeClass, + DatasetPropertiesClass, + DateTypeClass, + GlobalTagsClass, + NumberTypeClass, + OtherSchemaClass, + SchemaFieldClass, + SchemaFieldDataTypeClass, + SchemaMetadataClass, + StringTypeClass, + TagAssociationClass, + UnionTypeClass, +) + +log = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + + +class Neo4jConfig(ConfigModel): + username: str = Field(default=None, description="Neo4j Username") + password: str = Field(default=None, description="Neo4j Password") + uri: str = Field(default=None, description="The URI for the Neo4j server") + gms_server: str = Field(default=None, description="Address for the gms server") + environment: str = Field(default=None, description="Neo4j env") + node_tag: str = Field( + default="Node", + description="The tag that will be used to show that the Neo4j object is a Node", + ) + relationship_tag: str = Field( + default="Relationship", + description="The tag that will be used to show that the Neo4j object is a Relationship", + ) + platform: str = Field(default="neo4j", description="Neo4j platform") + type_mapping = { + "string": StringTypeClass(), + "boolean": BooleanTypeClass(), + "float": NumberTypeClass(), + "integer": NumberTypeClass(), + "date": DateTypeClass(), + "relationship": StringTypeClass(), + "node": StringTypeClass(), + "local_date_time": DateTypeClass(), + "list": UnionTypeClass(), + } + + +@dataclass +class Neo4jSourceReport(SourceReport): + obj_failures: int = 0 + obj_created: int = 0 + + +@platform_name("Metadata File") +@config_class(Neo4jConfig) +@support_status(SupportStatus.CERTIFIED) +class Neo4jSource(Source): + def __init__(self, ctx: PipelineContext, config: Neo4jConfig): + self.ctx = ctx + self.config = config + self.report = Neo4jSourceReport() + + @classmethod + def create(cls, config_dict, 
ctx): + config = Neo4jConfig.parse_obj(config_dict) + return cls(ctx, config) + + def get_schema_field_class( + self, col_name: str, col_type: str, **kwargs + ) -> SchemaFieldClass: + if kwargs["obj_type"] == "node" and col_type == "relationship": + col_type = "node" + else: + col_type = col_type + return SchemaFieldClass( + fieldPath=col_name, + type=SchemaFieldDataTypeClass(type=self.config.type_mapping[col_type]), + nativeDataType=col_type, + description=col_type.upper() + if col_type in ("node", "relationship") + else col_type, + lastModified=AuditStampClass( + time=round(time.time() * 1000), actor="urn:li:corpuser:ingestion" + ), + ) + + def add_properties( + self, dataset: str, description=None, custom_properties=None + ) -> MetadataChangeProposalWrapper: + dataset_properties = DatasetPropertiesClass( + description=description, + customProperties=custom_properties, + ) + return MetadataChangeProposalWrapper( + entityUrn=make_dataset_urn( + platform=self.config.platform, name=dataset, env=self.config.environment + ), + aspect=dataset_properties, + ) + + def generate_neo4j_object( + self, platform: str, dataset: str, columns: list, obj_type=None + ) -> MetadataChangeProposalWrapper: + try: + fields = [ + self.get_schema_field_class(key, value.lower(), obj_type=obj_type) + for d in columns + for key, value in d.items() + ] + mcp = MetadataChangeProposalWrapper( + entityUrn=make_dataset_urn( + platform=platform, name=dataset, env=self.config.environment + ), + aspect=SchemaMetadataClass( + schemaName=dataset, + platform=make_data_platform_urn(platform), + version=0, + hash="", + platformSchema=OtherSchemaClass(rawSchema=""), + lastModified=AuditStampClass( + time=round(time.time() * 1000), + actor="urn:li:corpuser:ingestion", + ), + fields=fields, + ), + systemMetadata=DatasetPropertiesClass( + customProperties={"properties": "property on object"} + ), + ) + self.report.obj_created += 1 + return mcp + except Exception as e: + log.error(e) + self.report.obj_failures += 1 + + def add_tag_to_dataset( + self, table_name: str, tag_name: str + ) -> MetadataChangeProposalWrapper: + graph = DataHubGraph(DatahubClientConfig(server=self.config.gms_server)) + dataset_urn = make_dataset_urn( + platform=self.config.platform, name=table_name, env=self.config.environment + ) + current_tags: Optional[GlobalTagsClass] = graph.get_aspect( + entity_urn=dataset_urn, + aspect_type=GlobalTagsClass, + ) + tag_to_add = make_tag_urn(tag_name) + tag_association_to_add = TagAssociationClass(tag=tag_to_add) + + if current_tags: + if tag_to_add not in [x.tag for x in current_tags.tags]: + current_tags.tags.append(TagAssociationClass(tag_to_add)) + else: + current_tags = GlobalTagsClass(tags=[tag_association_to_add]) + return MetadataChangeProposalWrapper( + entityUrn=dataset_urn, + aspect=current_tags, + ) + + def get_neo4j_metadata(self, query: str) -> pd.DataFrame: + driver = GraphDatabase.driver( + self.config.uri, auth=(self.config.username, self.config.password) + ) + """ + This process retrieves the metadata for Neo4j objects using an APOC query, which returns a dictionary + with two columns: key and value. The key represents the Neo4j object, while the value contains the + corresponding metadata. + + When data is returned from Neo4j, much of the relationship metadata is stored with the relevant node's + metadata. Consequently, the objects are organized into two separate dataframes: one for nodes and one for + relationships. + + In the node dataframe, several fields are extracted and added as new columns. 
Similarly, in the relationship + dataframe, certain fields are parsed out, while others require metadata from the nodes dataframe. + + Once the data is parsed and these two dataframes are created, we combine a subset of their columns into a + single dataframe, which will be used to create the DataHub objects. + + See the docs for examples of metadata: metadata-ingestion/docs/sources/neo4j/neo4j.md + """ + log.info(f"{query}") + with driver.session() as session: + result = session.run(query) + data = [record for record in result] + log.info("Closing Neo4j driver") + driver.close() + + node_df = self.process_nodes(data) + rel_df = self.process_relationships(data, node_df) + + union_cols = ["key", "obj_type", "property_data_types", "description"] + df = pd.concat([node_df[union_cols], rel_df[union_cols]]) + + return df + + def process_nodes(self, data): + nodes = [record for record in data if record["value"]["type"] == "node"] + node_df = pd.DataFrame( + nodes, + columns=["key", "value"], + ) + node_df["obj_type"] = node_df["value"].apply( + lambda record: self.get_obj_type(record) + ) + node_df["relationships"] = node_df["value"].apply( + lambda record: self.get_relationships(record) + ) + node_df["properties"] = node_df["value"].apply( + lambda record: self.get_properties(record) + ) + node_df["property_data_types"] = node_df["properties"].apply( + lambda record: self.get_property_data_types(record) + ) + node_df["description"] = node_df.apply( + lambda record: self.get_node_description(record, node_df), axis=1 + ) + return node_df + + def process_relationships(self, data, node_df): + rels = [record for record in data if record["value"]["type"] == "relationship"] + rel_df = pd.DataFrame(rels, columns=["key", "value"]) + rel_df["obj_type"] = rel_df["value"].apply( + lambda record: self.get_obj_type(record) + ) + rel_df["properties"] = rel_df["value"].apply( + lambda record: self.get_properties(record) + ) + rel_df["property_data_types"] = rel_df["properties"].apply( + lambda record: self.get_property_data_types(record) + ) + rel_df["description"] = rel_df.apply( + lambda record: self.get_rel_descriptions(record, node_df), axis=1 + ) + return rel_df + + def get_obj_type(self, record: dict) -> str: + return record["type"] + + def get_rel_descriptions(self, record: dict, df: pd.DataFrame) -> str: + descriptions = [] + for _, row in df.iterrows(): + relationships = row.get("relationships", {}) + for relationship, props in relationships.items(): + if record["key"] == relationship: + if props["direction"] == "in": + for prop in props["labels"]: + descriptions.append( + f"({row['key']})-[{record['key']}]->({prop})" + ) + return "\n".join(descriptions) + + def get_node_description(self, record: dict, df: pd.DataFrame) -> str: + descriptions = [] + for _, row in df.iterrows(): + if record["key"] == row["key"]: + for relationship, props in row["relationships"].items(): + direction = props["direction"] + for node in set(props["labels"]): + if direction == "in": + descriptions.append( + f"({row['key']})<-[{relationship}]-({node})" + ) + elif direction == "out": + descriptions.append( + f"({row['key']})-[{relationship}]->({node})" + ) + + return "\n".join(descriptions) + + def get_property_data_types(self, record: dict) -> list[dict]: + return [{k: v["type"]} for k, v in record.items()] + + def get_properties(self, record: dict) -> str: + return record["properties"] + + def get_relationships(self, record: dict) -> dict: + return record.get("relationships", None) + + def get_workunits_internal(self) -> 
Iterable[MetadataWorkUnit]: + df = self.get_neo4j_metadata( + "CALL apoc.meta.schema() YIELD value UNWIND keys(value) AS key RETURN key, value[key] AS value;" + ) + for index, row in df.iterrows(): + try: + yield MetadataWorkUnit( + id=row["key"], + mcp_raw=self.generate_neo4j_object( + columns=row["property_data_types"], + dataset=row["key"], + platform=self.config.platform, + ), + ) + + yield MetadataWorkUnit( + id=row["key"], + mcp=self.add_tag_to_dataset( + table_name=row["key"], + tag_name=self.config.node_tag + if row["obj_type"] == "node" + else self.config.relationship_tag, + ), + ) + + yield MetadataWorkUnit( + id=row["key"], + mcp=self.add_properties( + dataset=row["key"], + custom_properties=None, + description=row["description"], + ), + ) + + except Exception as e: + raise e + + def get_report(self): + return self.report diff --git a/metadata-ingestion/tests/unit/test_neo4j_source.py b/metadata-ingestion/tests/unit/test_neo4j_source.py new file mode 100644 index 00000000000000..f410cbcfff3ff2 --- /dev/null +++ b/metadata-ingestion/tests/unit/test_neo4j_source.py @@ -0,0 +1,157 @@ +import unittest + +import pandas as pd + +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.source.neo4j.neo4j_source import Neo4jConfig, Neo4jSource + + +class TestNeo4j(unittest.TestCase): + def setUp(self): + self.neo = Neo4jSource(Neo4jConfig(), PipelineContext(run_id="test")) + self.record_1 = { + "count": 1, + "labels": [], + "properties": { + "id": { + "unique": True, + "indexed": True, + "type": "STRING", + "existence": False, + }, + }, + "type": "node", + "relationships": { + "RELATIONSHIP_1": { + "count": 0, + "direction": "out", + "labels": ["Label_1"], + "properties": {}, + } + }, + } + self.record_2 = { + "count": 2, + "labels": [], + "properties": { + "id": { + "unique": True, + "indexed": True, + "type": "STRING", + "existence": False, + }, + "amount": { + "unique": True, + "indexed": True, + "type": "INTEGER", + "existence": False, + }, + }, + "type": "node", + "relationships": { + "RELATIONSHIP_1": { + "count": 0, + "direction": "out", + "labels": ["Label_1"], + "properties": {}, + }, + "RELATIONSHIP_2": { + "count": 1, + "direction": "in", + "labels": ["Label_1", "Label_2"], + "properties": {}, + }, + }, + } + self.record_3 = {"count": 3, "properties": {}, "type": "relationship"} + self.record_4 = { + "RELATIONSHIP_2": { + "count": 4, + "properties": {}, + "type": "relationship", + "relationships": { + "RELATIONSHIP_1": { + "count": 0, + "direction": "out", + "labels": ["Label_1"], + "properties": {}, + }, + "RELATIONSHIP_2": { + "count": 1, + "direction": "in", + "labels": ["Label_1", "Label_2"], + "properties": {}, + }, + }, + } + } + + def create_df(self): + data = { + "key": ["item1", "item2", "item3", "RELATIONSHIP_2"], + "value": [ + self.record_1, + self.record_2, + self.record_3, + self.record_4, + ], + } + df = pd.DataFrame(data) + return df + + + def test_get_obj_type(self): + assert self.neo.get_obj_type(self.record_1) == "node" + assert self.neo.get_obj_type(self.record_2) == "node" + assert self.neo.get_obj_type(self.record_3) == "relationship" + + def test_get_relationships(self): + assert self.neo.get_relationships(self.record_1, self.create_df()) == { + "RELATIONSHIP_1": { + "count": 0, + "direction": "out", + "labels": ["Label_1"], + "properties": {}, + } + } + assert self.neo.get_relationships(self.record_2, self.create_df()) == { + "RELATIONSHIP_1": { + "count": 0, + "direction": "out", + "labels": ["Label_1"], + "properties": {}, + 
}, + "RELATIONSHIP_2": { + "count": 1, + "direction": "in", + "labels": ["Label_1", "Label_2"], + "properties": {}, + }, + } + assert self.neo.get_relationships(self.record_3, self.create_df()) is None + + def test_get_property_data_types(self): + record_1 = self.record_1.get("properties", None) + record_2 = self.record_2.get("properties", None) + assert self.neo.get_property_data_types(record_1) == [{"id": "STRING"}] + assert self.neo.get_property_data_types(record_2) == [ + {"id": "STRING"}, + {"amount": "INTEGER"}, + ] + + def test_get_properties(self): + assert self.neo.get_properties(self.record_1) == { + "id": { + "unique": True, + "indexed": True, + "type": "STRING", + "existence": False, + }, + } + assert self.neo.get_properties(self.record_2) == self.record_2.get( + "properties", None + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/metadata-service/war/src/main/resources/boot/data_platforms.json b/metadata-service/war/src/main/resources/boot/data_platforms.json index 4830311996fd94..c792a247b34549 100644 --- a/metadata-service/war/src/main/resources/boot/data_platforms.json +++ b/metadata-service/war/src/main/resources/boot/data_platforms.json @@ -674,5 +674,15 @@ "type": "OTHERS", "logoUrl": "/assets/platforms/saclogo.svg" } + }, + { + "urn": "urn:li:dataPlatform:neo4j", + "aspect": { + "datasetNameDelimiter": ".", + "name": "Neo4j", + "displayName": "Neo4j", + "type": "OTHERS", + "logoUrl": "/assets/platforms/neo4j.png" + } } ] From 53c2463831fa2b964a4158de348a953a8de05165 Mon Sep 17 00:00:00 2001 From: kbartlett Date: Thu, 10 Oct 2024 15:23:12 -0400 Subject: [PATCH 02/31] feat: connector for Neo4j --- .../docs/sources/neo4j/neo4j_recipe.yml | 4 +-- .../recipes/neo4j_to_datahub.dhub.yaml | 4 +-- .../ingestion/source/neo4j/neo4j_source.py | 36 +++++++++++-------- 3 files changed, 25 insertions(+), 19 deletions(-) diff --git a/metadata-ingestion/docs/sources/neo4j/neo4j_recipe.yml b/metadata-ingestion/docs/sources/neo4j/neo4j_recipe.yml index 71202d4216df91..af5985b1575e2c 100644 --- a/metadata-ingestion/docs/sources/neo4j/neo4j_recipe.yml +++ b/metadata-ingestion/docs/sources/neo4j/neo4j_recipe.yml @@ -4,7 +4,7 @@ source: uri: 'neo4j+ssc://host:7687' username: 'neo4j' password: 'password' - gms_server: &gms_server 'http://localhost:8080' + gms_server: 'http://localhost:8080' node_tag: 'Node' relationship_tag: 'Relationship' environment: 'PROD' @@ -12,4 +12,4 @@ source: sink: type: "datahub-rest" config: - server: *gms_server \ No newline at end of file + server: 'http://localhost:8080' \ No newline at end of file diff --git a/metadata-ingestion/examples/recipes/neo4j_to_datahub.dhub.yaml b/metadata-ingestion/examples/recipes/neo4j_to_datahub.dhub.yaml index 71202d4216df91..af5985b1575e2c 100644 --- a/metadata-ingestion/examples/recipes/neo4j_to_datahub.dhub.yaml +++ b/metadata-ingestion/examples/recipes/neo4j_to_datahub.dhub.yaml @@ -4,7 +4,7 @@ source: uri: 'neo4j+ssc://host:7687' username: 'neo4j' password: 'password' - gms_server: &gms_server 'http://localhost:8080' + gms_server: 'http://localhost:8080' node_tag: 'Node' relationship_tag: 'Relationship' environment: 'PROD' @@ -12,4 +12,4 @@ source: sink: type: "datahub-rest" config: - server: *gms_server \ No newline at end of file + server: 'http://localhost:8080' \ No newline at end of file diff --git a/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py b/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py index becea2c3bd7630..de4790d756dcdb 100644 --- 
a/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py @@ -1,7 +1,7 @@ import logging import time from dataclasses import dataclass -from typing import Iterable, Optional +from typing import Dict, Iterable, Optional, Type, Union import pandas as pd from neo4j import GraphDatabase @@ -24,6 +24,7 @@ from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph +from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaFieldDataType from datahub.metadata.schema_classes import ( AuditStampClass, BooleanTypeClass, @@ -33,7 +34,6 @@ NumberTypeClass, OtherSchemaClass, SchemaFieldClass, - SchemaFieldDataTypeClass, SchemaMetadataClass, StringTypeClass, TagAssociationClass, @@ -43,6 +43,18 @@ log = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) +_type_mapping: Dict[Union[Type, str], Type] = { + "list": UnionTypeClass, + "boolean": BooleanTypeClass, + "integer": NumberTypeClass, + "local_date_time": DateTypeClass, + "float": NumberTypeClass, + "string": StringTypeClass, + "date": DateTypeClass, + "node": StringTypeClass, + "relationship": StringTypeClass, +} + class Neo4jConfig(ConfigModel): username: str = Field(default=None, description="Neo4j Username") @@ -59,17 +71,6 @@ class Neo4jConfig(ConfigModel): description="The tag that will be used to show that the Neo4j object is a Relationship", ) platform: str = Field(default="neo4j", description="Neo4j platform") - type_mapping = { - "string": StringTypeClass(), - "boolean": BooleanTypeClass(), - "float": NumberTypeClass(), - "integer": NumberTypeClass(), - "date": DateTypeClass(), - "relationship": StringTypeClass(), - "node": StringTypeClass(), - "local_date_time": DateTypeClass(), - "list": UnionTypeClass(), - } @dataclass @@ -78,7 +79,7 @@ class Neo4jSourceReport(SourceReport): obj_created: int = 0 -@platform_name("Metadata File") +@platform_name("Neo4j",id="neo4j") @config_class(Neo4jConfig) @support_status(SupportStatus.CERTIFIED) class Neo4jSource(Source): @@ -92,6 +93,10 @@ def create(cls, config_dict, ctx): config = Neo4jConfig.parse_obj(config_dict) return cls(ctx, config) + def get_field_type(self, attribute_type: Union[type, str]) -> SchemaFieldDataType: + type_class: Optional[type] = _type_mapping.get(attribute_type) + return SchemaFieldDataType(type=type_class()) + def get_schema_field_class( self, col_name: str, col_type: str, **kwargs ) -> SchemaFieldClass: @@ -101,7 +106,7 @@ def get_schema_field_class( col_type = col_type return SchemaFieldClass( fieldPath=col_name, - type=SchemaFieldDataTypeClass(type=self.config.type_mapping[col_type]), + type=self.get_field_type(col_type), nativeDataType=col_type, description=col_type.upper() if col_type in ("node", "relationship") @@ -312,6 +317,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: yield MetadataWorkUnit( id=row["key"], mcp_raw=self.generate_neo4j_object( + # mcp=self.generate_neo4j_object( columns=row["property_data_types"], dataset=row["key"], platform=self.config.platform, From d02b3c10b5161f484f118f96393706c2d9fcf7fa Mon Sep 17 00:00:00 2001 From: kbartlett Date: Thu, 10 Oct 2024 16:23:02 -0400 Subject: [PATCH 03/31] feat: connector for Neo4j --- metadata-ingestion/examples/cli_usage/gen_schemas.py | 1 - .../api/entities/platformresource/platform_resource.py | 1 - 
.../entities/structuredproperties/structuredproperties.py | 1 - metadata-ingestion/src/datahub/ingestion/fs/s3_fs.py | 1 - .../src/datahub/ingestion/glossary/classification_mixin.py | 2 -- .../datahub/ingestion/source/bigquery_v2/bigquery_config.py | 1 - .../ingestion/source/bigquery_v2/bigquery_schema_gen.py | 3 --- .../ingestion/source/bigquery_v2/queries_extractor.py | 4 ---- .../datahub/ingestion/source/confluent_schema_registry.py | 1 - .../src/datahub/ingestion/source/data_lake_common/config.py | 1 - .../src/datahub/ingestion/source/datahub/datahub_source.py | 1 - .../src/datahub/ingestion/source/dynamodb/dynamodb.py | 1 - .../src/datahub/ingestion/source/looker/looker_common.py | 4 ---- .../src/datahub/ingestion/source/looker/looker_source.py | 2 -- .../ingestion/source/looker/looker_template_language.py | 3 --- .../ingestion/source/looker/lookml_concept_context.py | 2 -- .../src/datahub/ingestion/source/looker/lookml_source.py | 3 --- .../src/datahub/ingestion/source/looker/view_upstream.py | 3 --- .../src/datahub/ingestion/source/metadata/lineage.py | 1 - .../src/datahub/ingestion/source/neo4j/__init__.py | 0 metadata-ingestion/src/datahub/ingestion/source/nifi.py | 1 - .../ingestion/source/powerbi/m_query/native_sql_parser.py | 1 - .../src/datahub/ingestion/source/powerbi/m_query/parser.py | 1 - .../datahub/ingestion/source/powerbi/m_query/resolver.py | 5 ----- .../source/powerbi/rest_api_wrapper/powerbi_api.py | 1 - .../ingestion/source/powerbi_report_server/report_server.py | 1 - .../src/datahub/ingestion/source/redshift/redshift.py | 1 - .../src/datahub/ingestion/source/s3/source.py | 1 - metadata-ingestion/src/datahub/ingestion/source/sac/sac.py | 1 - .../ingestion/source/snowflake/snowflake_lineage_v2.py | 1 - .../src/datahub/ingestion/source/sql/cockroachdb.py | 1 - .../src/datahub/ingestion/source/sql/oracle.py | 6 ------ .../datahub/ingestion/source/state/entity_removal_state.py | 6 +++++- .../src/datahub/ingestion/source/tableau/tableau.py | 1 - .../ingestion/transformer/add_dataset_dataproduct.py | 1 - .../src/datahub/ingestion/transformer/add_dataset_tags.py | 1 - .../ingestion/transformer/extract_ownership_from_tags.py | 1 - .../datahub/ingestion/transformer/replace_external_url.py | 1 - .../src/datahub/ingestion/transformer/tags_to_terms.py | 1 - .../assertion/snowflake/metric_sql_generator.py | 1 - metadata-ingestion/src/datahub/specific/dashboard.py | 1 - .../src/datahub/sql_parsing/sql_parsing_aggregator.py | 1 - .../src/datahub/sql_parsing/sqlglot_lineage.py | 1 - .../src/datahub/sql_parsing/tool_meta_extractor.py | 1 - metadata-ingestion/src/datahub/testing/mcp_diff.py | 2 +- metadata-ingestion/src/datahub/utilities/mapping.py | 2 -- .../src/datahub/utilities/threaded_iterator_executor.py | 1 - .../tests/integration/azure_ad/test_azure_ad.py | 1 - .../tests/integration/bigquery_v2/test_bigquery_queries.py | 1 - metadata-ingestion/tests/integration/looker/test_looker.py | 1 - metadata-ingestion/tests/integration/lookml/test_lookml.py | 1 - metadata-ingestion/tests/integration/okta/test_okta.py | 3 --- metadata-ingestion/tests/integration/oracle/common.py | 1 - .../tests/integration/powerbi/test_powerbi.py | 6 ------ .../tests/integration/qlik_sense/test_qlik_sense.py | 2 -- metadata-ingestion/tests/integration/sigma/test_sigma.py | 3 --- metadata-ingestion/tests/integration/snowflake/common.py | 1 - .../tests/integration/tableau/test_tableau_ingest.py | 2 -- .../tests/integration/unity/test_unity_catalog_ingest.py | 1 - 
.../tests/performance/snowflake/test_snowflake.py | 1 - .../tests/unit/api/entities/common/test_serialized_value.py | 3 --- .../api/source_helpers/test_incremental_lineage_helper.py | 1 - .../tests/unit/sql_parsing/test_sql_aggregator.py | 1 - .../tests/unit/test_bigqueryv2_usage_source.py | 1 - metadata-ingestion/tests/unit/test_neo4j_source.py | 1 - metadata-ingestion/tests/unit/test_redshift_lineage.py | 1 - 66 files changed, 6 insertions(+), 105 deletions(-) create mode 100644 metadata-ingestion/src/datahub/ingestion/source/neo4j/__init__.py diff --git a/metadata-ingestion/examples/cli_usage/gen_schemas.py b/metadata-ingestion/examples/cli_usage/gen_schemas.py index 2fd4683347a3ba..80b2c6712977ad 100644 --- a/metadata-ingestion/examples/cli_usage/gen_schemas.py +++ b/metadata-ingestion/examples/cli_usage/gen_schemas.py @@ -28,7 +28,6 @@ class CorpGroupFile(BaseModel): with open("user/user.dhub.yaml_schema.json", "w") as fp: - fp.write(json.dumps(CorpUserFile.schema(), indent=4)) with open("group/group.dhub.yaml_schema.json", "w") as fp: diff --git a/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py b/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py index 2b730ccb86f513..2aefcbd674f186 100644 --- a/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py +++ b/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py @@ -70,7 +70,6 @@ def to_resource_info(self) -> models.PlatformResourceInfoClass: class OpenAPIGraphClient: - ENTITY_KEY_ASPECT_MAP = { aspect_type.ASPECT_INFO.get("keyForEntity"): name for name, aspect_type in models.ASPECT_NAME_MAP.items() diff --git a/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py b/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py index 5b188edf9563bc..dab486c7f6f175 100644 --- a/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py +++ b/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py @@ -150,7 +150,6 @@ def create(file: str, graph: Optional[DataHubGraph] = None) -> None: @classmethod def from_datahub(cls, graph: DataHubGraph, urn: str) -> "StructuredProperties": - structured_property: Optional[ StructuredPropertyDefinitionClass ] = graph.get_aspect(urn, StructuredPropertyDefinitionClass) diff --git a/metadata-ingestion/src/datahub/ingestion/fs/s3_fs.py b/metadata-ingestion/src/datahub/ingestion/fs/s3_fs.py index a135b7b6ce8375..9c34c4f83b0a93 100644 --- a/metadata-ingestion/src/datahub/ingestion/fs/s3_fs.py +++ b/metadata-ingestion/src/datahub/ingestion/fs/s3_fs.py @@ -32,7 +32,6 @@ def __str__(self): class S3ListIterator(Iterator): - MAX_KEYS = 1000 def __init__( diff --git a/metadata-ingestion/src/datahub/ingestion/glossary/classification_mixin.py b/metadata-ingestion/src/datahub/ingestion/glossary/classification_mixin.py index 1d381acbf3dbe9..98c43079a3bc15 100644 --- a/metadata-ingestion/src/datahub/ingestion/glossary/classification_mixin.py +++ b/metadata-ingestion/src/datahub/ingestion/glossary/classification_mixin.py @@ -33,7 +33,6 @@ @dataclass class ClassificationReportMixin: - num_tables_fetch_sample_values_failed: int = 0 num_tables_classification_attempted: int = 0 @@ -112,7 +111,6 @@ def classify_schema_fields( schema_metadata: SchemaMetadata, sample_data: Union[Dict[str, list], Callable[[], Dict[str, list]]], ) -> None: - if not isinstance(sample_data, Dict): try: # TODO: 
In future, sample_data fetcher can be lazily called if classification diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index 2a34fb82c2050c..e7b51f601d7f85 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -339,7 +339,6 @@ class BigQueryV2Config( StatefulProfilingConfigMixin, ClassificationSourceConfigMixin, ): - include_schema_metadata: bool = Field( default=True, description="Whether to ingest the BigQuery schema, i.e. projects, schemas, tables, and views.", diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py index 11d06771d4e4f4..a1173ef8efee50 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py @@ -304,7 +304,6 @@ def _process_project( project_id ) except Exception as e: - if self.config.project_ids and "not enabled BigQuery." in str(e): action_mesage = ( "The project has not enabled BigQuery API. " @@ -365,7 +364,6 @@ def _process_project_datasets( bigquery_project: BigqueryProject, db_tables: Dict[str, List[BigqueryTable]], ) -> Iterable[MetadataWorkUnit]: - db_views: Dict[str, List[BigqueryView]] = {} db_snapshots: Dict[str, List[BigqueryTableSnapshot]] = {} project_id = bigquery_project.id @@ -1004,7 +1002,6 @@ def get_tables_for_dataset( ) -> Iterable[BigqueryTable]: # In bigquery there is no way to query all tables in a Project id with PerfTimer() as timer: - # PARTITIONS INFORMATION_SCHEMA view is not available for BigLake tables # based on Amazon S3 and Blob Storage data. # https://cloud.google.com/bigquery/docs/omni-introduction#limitations diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py index b4a443673b9a97..27cd574c7766d0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py @@ -290,7 +290,6 @@ def get_workunits_internal( def deduplicate_queries( self, queries: FileBackedList[ObservedQuery] ) -> FileBackedDict[Dict[int, ObservedQuery]]: - # This fingerprint based deduplication is done here to reduce performance hit due to # repetitive sql parsing while adding observed query to aggregator that would otherwise # parse same query multiple times. In future, aggregator may absorb this deduplication. 
@@ -328,7 +327,6 @@ def deduplicate_queries( return queries_deduped def fetch_query_log(self, project: BigqueryProject) -> Iterable[ObservedQuery]: - # Multi-regions from https://cloud.google.com/bigquery/docs/locations#supported_locations regions = self.config.region_qualifiers @@ -341,7 +339,6 @@ def fetch_query_log(self, project: BigqueryProject) -> Iterable[ObservedQuery]: def fetch_region_query_log( self, project: BigqueryProject, region: str ) -> Iterable[ObservedQuery]: - # Each region needs to be a different query query_log_query = _build_enriched_query_log_query( project_id=project.id, @@ -435,7 +432,6 @@ def _build_enriched_query_log_query( start_time: datetime, end_time: datetime, ) -> str: - audit_start_time = start_time.strftime(BQ_DATETIME_FORMAT) audit_end_time = end_time.strftime(BQ_DATETIME_FORMAT) diff --git a/metadata-ingestion/src/datahub/ingestion/source/confluent_schema_registry.py b/metadata-ingestion/src/datahub/ingestion/source/confluent_schema_registry.py index 09ce8b5b05203c..ed51487ea6dab2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/confluent_schema_registry.py +++ b/metadata-ingestion/src/datahub/ingestion/source/confluent_schema_registry.py @@ -371,7 +371,6 @@ def _get_schema_fields( def _get_schema_metadata( self, topic: str, platform_urn: str, is_subject: bool ) -> Optional[SchemaMetadata]: - # Process the value schema schema, fields = self._get_schema_and_fields( topic=topic, diff --git a/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/config.py b/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/config.py index 5f88cf0234947a..ede7d3c3c56959 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/config.py @@ -7,7 +7,6 @@ class PathSpecsConfigMixin(ConfigModel): - path_specs: List[PathSpec] = Field( description="List of PathSpec. 
See [below](#path-spec) the details about PathSpec" ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py index 0204a864e2b9ea..7d2d99dbb3b23a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py @@ -96,7 +96,6 @@ def _get_database_workunits( ) mcps = reader.get_aspects(from_createdon, self.report.stop_time) for i, (mcp, createdon) in enumerate(mcps): - if not self.urn_pattern.allowed(str(mcp.entityUrn)): continue diff --git a/metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py b/metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py index acda656526ef53..9faa12d5d9bb61 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py @@ -235,7 +235,6 @@ def _process_table( table_name: str, dataset_name: str, ) -> Iterable[MetadataWorkUnit]: - logger.debug(f"Processing table: {dataset_name}") table_info = dynamodb_client.describe_table(TableName=table_name)["Table"] account_id = table_info["TableArn"].split(":")[4] diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index 3cbb13375229b9..df855ede985313 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -307,7 +307,6 @@ def view_fields_from_dict( type_cls: ViewFieldType, populate_sql_logic_in_descriptions: bool, ) -> "ViewField": - is_primary_key = field_dict.get("primary_key", "no") == "yes" name = field_dict["name"] @@ -988,13 +987,11 @@ def from_api( # noqa: C901 field_name_vs_raw_explore_field: Dict = {} if explore.fields is not None: - if explore.fields.dimensions is not None: for dim_field in explore.fields.dimensions: if dim_field.name is None: continue else: - field_name_vs_raw_explore_field[dim_field.name] = dim_field view_fields.append( @@ -1035,7 +1032,6 @@ def from_api( # noqa: C901 if measure_field.name is None: continue else: - field_name_vs_raw_explore_field[ measure_field.name ] = measure_field diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py index f269ccf1cd98f8..165d80b707000d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py @@ -609,7 +609,6 @@ def _get_folder_browse_path_v2_entries( def _create_platform_instance_aspect( self, ) -> DataPlatformInstance: - assert ( self.source_config.platform_name ), "Platform name is not set in the configuration." @@ -994,7 +993,6 @@ def _gen_folder_key(self, folder_id: str) -> LookerFolderKey: def _make_dashboard_and_chart_mces( self, looker_dashboard: LookerDashboard ) -> Iterable[Union[MetadataChangeEvent, MetadataChangeProposalWrapper]]: - # Step 1: Emit metadata for each Chart inside the Dashboard. 
chart_events = [] for element in looker_dashboard.dashboard_elements: diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_template_language.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_template_language.py index 1e60c08fe00c2b..6d49d57e077435 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_template_language.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_template_language.py @@ -55,7 +55,6 @@ def _create_new_liquid_variables_with_default( current_dict: dict = new_dict for key in keys[:-1]: - if key not in current_dict: current_dict[key] = {} @@ -392,7 +391,6 @@ def process_lookml_template_language( source_config: LookMLSourceConfig, view_lkml_file_dict: dict, ) -> None: - if "views" not in view_lkml_file_dict: return @@ -425,7 +423,6 @@ def load_and_preprocess_file( path: Union[str, pathlib.Path], source_config: LookMLSourceConfig, ) -> dict: - parsed = load_lkml(path) process_lookml_template_language( diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_concept_context.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_concept_context.py index ce4a242027e11a..80be566cdcd468 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_concept_context.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_concept_context.py @@ -320,7 +320,6 @@ def get_including_extends( self, field: str, ) -> Optional[Any]: - # According to Looker's inheritance rules, we need to merge the fields(i.e. dimensions, measures and # dimension_groups) from both the child and parent. if field in [DIMENSIONS, DIMENSION_GROUPS, MEASURES]: @@ -345,7 +344,6 @@ def _get_sql_table_name_field(self) -> Optional[str]: return self.get_including_extends(field="sql_table_name") def _is_dot_sql_table_name_present(self) -> bool: - sql_table_name: Optional[str] = self._get_sql_table_name_field() if sql_table_name is None: diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py index e4d8dd19fb7917..1096fd3fd3ccc7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py @@ -143,7 +143,6 @@ def from_looker_dict( extract_col_level_lineage: bool = False, populate_sql_logic_in_descriptions: bool = False, ) -> Optional["LookerView"]: - view_name = view_context.name() logger.debug(f"Handling view {view_name} in model {model_name}") @@ -419,7 +418,6 @@ def _get_custom_properties(self, looker_view: LookerView) -> DatasetPropertiesCl def _build_dataset_mcps( self, looker_view: LookerView ) -> List[MetadataChangeProposalWrapper]: - view_urn = looker_view.id.get_urn(self.source_config) subTypeEvent = MetadataChangeProposalWrapper( @@ -503,7 +501,6 @@ def get_project_name(self, model_name: str) -> str: def get_manifest_if_present(self, folder: pathlib.Path) -> Optional[LookerManifest]: manifest_file = folder / "manifest.lkml" if manifest_file.exists(): - manifest_dict = load_and_preprocess_file( path=manifest_file, source_config=self.source_config ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py b/metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py index 057dbca4281849..7dd2f9cb203336 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py +++ 
b/metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py @@ -72,7 +72,6 @@ def resolve_derived_view_urn_of_col_ref( base_folder_path: str, config: LookMLSourceConfig, ) -> List[ColumnRef]: - new_column_refs: List[ColumnRef] = [] for col_ref in column_refs: if is_derived_view(col_ref.table.lower()): @@ -641,7 +640,6 @@ def create_view_upstream( ctx: PipelineContext, reporter: LookMLSourceReport, ) -> AbstractViewUpstream: - if view_context.is_regular_case(): return RegularViewUpstream( view_context=view_context, @@ -666,7 +664,6 @@ def create_view_upstream( view_context.is_sql_based_derived_view_without_fields_case(), ] ): - return DerivedQueryUpstreamSource( view_context=view_context, config=config, diff --git a/metadata-ingestion/src/datahub/ingestion/source/metadata/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/metadata/lineage.py index 08ed7677c7ab4c..9f96f837eb9b3a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/metadata/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/metadata/lineage.py @@ -210,7 +210,6 @@ def _get_lineage_mcp( # extract the old lineage and save it for the new mcp if preserve_upstream: - client = get_default_graph() old_upstream_lineage = get_aspects_for_entity( diff --git a/metadata-ingestion/src/datahub/ingestion/source/neo4j/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/neo4j/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/metadata-ingestion/src/datahub/ingestion/source/nifi.py b/metadata-ingestion/src/datahub/ingestion/source/nifi.py index 7072ebf6473df1..f55d7a883edefe 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/nifi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/nifi.py @@ -464,7 +464,6 @@ def report_dropped(self, ent_name: str) -> None: @support_status(SupportStatus.CERTIFIED) @capability(SourceCapability.LINEAGE_COARSE, "Supported. 
See docs for limitations") class NifiSource(Source): - config: NifiSourceConfig report: NifiSourceReport diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py index 27efad6dc21caa..43c6c9aacd9ce8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py @@ -62,7 +62,6 @@ def parse_custom_sql( env: str, platform_instance: Optional[str], ) -> Optional["SqlParsingResult"]: - logger.debug("Using sqlglot_lineage to parse custom sql") sql_query = remove_special_characters(query) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py index 3edaaed2ff8148..a82384c6526b23 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py @@ -57,7 +57,6 @@ def get_upstream_tables( config: PowerBiDashboardSourceConfig, parameters: Dict[str, str] = {}, ) -> List[resolver.Lineage]: - if table.expression is None: logger.debug(f"There is no M-Query expression in table {table.full_name}") return [] diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py index 20fb0b5facbbc1..d8918da42ae338 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py @@ -64,7 +64,6 @@ def urn_creator( server: str, qualified_table_name: str, ) -> str: - platform_detail: PlatformDetail = platform_instance_resolver.get_platform_instance( PowerBIPlatformDetail( data_platform_pair=data_platform_pair, @@ -162,7 +161,6 @@ def create_reference_table( arg_list: Tree, table_detail: Dict[str, str], ) -> Optional[ReferencedTable]: - arguments: List[str] = tree_function.strip_char_from_list( values=tree_function.remove_whitespaces_from_list( tree_function.token_values(arg_list) @@ -202,7 +200,6 @@ def create_reference_table( def parse_custom_sql( self, query: str, server: str, database: Optional[str], schema: Optional[str] ) -> Lineage: - dataplatform_tables: List[DataPlatformTable] = [] platform_detail: PlatformDetail = ( @@ -703,7 +700,6 @@ def create_urn_using_old_parser( def create_lineage( self, data_access_func_detail: DataAccessFunctionDetail ) -> Lineage: - arguments: List[str] = tree_function.strip_char_from_list( values=tree_function.remove_whitespaces_from_list( tree_function.token_values(data_access_func_detail.arg_list) @@ -809,7 +805,6 @@ def form_qualified_table_name( table_reference: ReferencedTable, data_platform_pair: DataPlatformPair, ) -> str: - platform_detail: PlatformDetail = ( self.platform_instance_resolver.get_platform_instance( PowerBIPlatformDetail( diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py index a245d4c2b9a35d..3fb0beac272a6f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py @@ -422,7 +422,6 @@ def _fill_metadata_from_scan_result( return 
workspaces def _fill_independent_datasets(self, workspace: Workspace) -> None: - reachable_datasets: List[str] = [] # Find out reachable datasets for dashboard in workspace.dashboards: diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server.py index 8854f9ff48348d..2a247d0c63957a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server.py @@ -126,7 +126,6 @@ def log_http_error(e: BaseException, message: str) -> Any: def get_response_dict(response: requests.Response, error_message: str) -> dict: - result_dict: dict = {} try: response.raise_for_status() diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py index a9fc9ab8f3e993..ca1ffd5adb5676 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py @@ -436,7 +436,6 @@ def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit def _extract_metadata( self, connection: redshift_connector.Connection, database: str ) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]: - yield from self.gen_database_container( database=database, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py index e8c70260ebc7ce..1863663f98bb24 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py @@ -804,7 +804,6 @@ def get_dir_to_process( protocol: str, min: bool = False, ) -> List[str]: - # if len(path_spec.include.split("/")) == len(f"{protocol}{bucket_name}/{folder}".split("/")): # return [f"{protocol}{bucket_name}/{folder}"] diff --git a/metadata-ingestion/src/datahub/ingestion/source/sac/sac.py b/metadata-ingestion/src/datahub/ingestion/source/sac/sac.py index 8309c469f67c5f..c7af7a44a37cab 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sac/sac.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sac/sac.py @@ -402,7 +402,6 @@ def get_model_workunits( columns = self.get_import_data_model_columns(model_id=model.model_id) for column in columns: - schema_field = SchemaFieldClass( fieldPath=column.name, type=self.get_schema_field_data_type(column), diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py index 6f9c9259b27844..4a03717754ec26 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py @@ -234,7 +234,6 @@ def populate_known_query_lineage( def get_known_query_lineage( self, query: Query, dataset_name: str, db_row: UpstreamLineageEdge ) -> Optional[KnownQueryLineageInfo]: - if not db_row.UPSTREAM_TABLES: return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/cockroachdb.py b/metadata-ingestion/src/datahub/ingestion/source/sql/cockroachdb.py index 5356cee7f6ea30..76b72d8e37f74b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/cockroachdb.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/cockroachdb.py @@ -28,7 
+28,6 @@ class CockroachDBConfig(PostgresConfig): @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration") @capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion") class CockroachDBSource(PostgresSource): - config: CockroachDBConfig def __init__(self, config: CockroachDBConfig, ctx: PipelineContext): diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py b/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py index dcc1340c81d7b7..45c96713ed4352 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py @@ -173,7 +173,6 @@ def get_table_names(self, schema: Optional[str] = None) -> List[str]: ] def get_view_names(self, schema: Optional[str] = None) -> List[str]: - schema = self._inspector_instance.dialect.denormalize_name( schema or self.default_schema_name ) @@ -195,7 +194,6 @@ def get_view_names(self, schema: Optional[str] = None) -> List[str]: def get_columns( self, table_name: str, schema: Optional[str] = None, dblink: str = "" ) -> List[dict]: - denormalized_table_name = self._inspector_instance.dialect.denormalize_name( table_name ) @@ -339,7 +337,6 @@ def get_columns( return columns def get_table_comment(self, table_name: str, schema: Optional[str] = None) -> Dict: - denormalized_table_name = self._inspector_instance.dialect.denormalize_name( table_name ) @@ -411,7 +408,6 @@ def _get_constraint_data( def get_pk_constraint( self, table_name: str, schema: Optional[str] = None, dblink: str = "" ) -> Dict: - denormalized_table_name = self._inspector_instance.dialect.denormalize_name( table_name ) @@ -453,7 +449,6 @@ def get_pk_constraint( def get_foreign_keys( self, table_name: str, schema: Optional[str] = None, dblink: str = "" ) -> List: - denormalized_table_name = self._inspector_instance.dialect.denormalize_name( table_name ) @@ -535,7 +530,6 @@ def fkey_rec(): def get_view_definition( self, view_name: str, schema: Optional[str] = None ) -> Union[str, None]: - denormalized_view_name = self._inspector_instance.dialect.denormalize_name( view_name ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/state/entity_removal_state.py b/metadata-ingestion/src/datahub/ingestion/source/state/entity_removal_state.py index f011aa7bdd19e4..f10340d3a4cf97 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/state/entity_removal_state.py +++ b/metadata-ingestion/src/datahub/ingestion/source/state/entity_removal_state.py @@ -138,7 +138,11 @@ def urn_count(self) -> int: def compute_percent_entities_changed( new_entities: List[str], old_entities: List[str] ) -> float: - (overlap_count, old_count, _,) = _get_entity_overlap_and_cardinalities( + ( + overlap_count, + old_count, + _, + ) = _get_entity_overlap_and_cardinalities( new_entities=new_entities, old_entities=old_entities ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index 9f011790990ec2..7f8c93fddc9667 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -2086,7 +2086,6 @@ def parse_custom_sql( def _enrich_database_tables_with_parsed_schemas( self, parsing_result: SqlParsingResult ) -> None: - in_tables_schemas: Dict[ str, Set[str] ] = transform_parsing_result_to_in_tables_schemas(parsing_result) diff --git 
a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_dataproduct.py b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_dataproduct.py index ce224bde003fd3..bb1c297513de10 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_dataproduct.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_dataproduct.py @@ -105,7 +105,6 @@ class SimpleAddDatasetDataProduct(AddDatasetDataProduct): """Transformer that adds a specified dataproduct entity for provided dataset as its asset.""" def __init__(self, config: SimpleDatasetDataProductConfig, ctx: PipelineContext): - generic_config = AddDatasetDataProductConfig( get_data_product_to_add=lambda dataset_urn: config.dataset_to_data_product_urns.get( dataset_urn diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_tags.py b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_tags.py index ef6ef43fa2d7f3..c60f4dca28882d 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_tags.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_tags.py @@ -67,7 +67,6 @@ def transform_aspect( def handle_end_of_stream( self, ) -> List[Union[MetadataChangeProposalWrapper, MetadataChangeProposalClass]]: - mcps: List[ Union[MetadataChangeProposalWrapper, MetadataChangeProposalClass] ] = [] diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/extract_ownership_from_tags.py b/metadata-ingestion/src/datahub/ingestion/transformer/extract_ownership_from_tags.py index 245a3aa3d9db15..212e018dd64fb7 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/extract_ownership_from_tags.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/extract_ownership_from_tags.py @@ -105,7 +105,6 @@ def convert_tag_as_per_mapping(self, tag: str) -> str: def handle_end_of_stream( self, ) -> Sequence[Union[MetadataChangeProposalWrapper, MetadataChangeProposalClass]]: - return self.owner_mcps def transform_aspect( diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/replace_external_url.py b/metadata-ingestion/src/datahub/ingestion/transformer/replace_external_url.py index 57af10d1040c8a..f6847f234aefe6 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/replace_external_url.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/replace_external_url.py @@ -103,7 +103,6 @@ def create( def transform_aspect( self, entity_urn: str, aspect_name: str, aspect: Optional[Aspect] ) -> Optional[Aspect]: - in_container_properties_aspect: ContainerPropertiesClass = cast( ContainerPropertiesClass, aspect ) diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/tags_to_terms.py b/metadata-ingestion/src/datahub/ingestion/transformer/tags_to_terms.py index 338f191c0829df..7e6125079f16e3 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/tags_to_terms.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/tags_to_terms.py @@ -84,7 +84,6 @@ def get_tags_from_schema_metadata( def transform_aspect( self, entity_urn: str, aspect_name: str, aspect: Optional[Aspect] ) -> Optional[Aspect]: - in_glossary_terms: Optional[GlossaryTermsClass] = cast( Optional[GlossaryTermsClass], aspect ) diff --git a/metadata-ingestion/src/datahub/integrations/assertion/snowflake/metric_sql_generator.py b/metadata-ingestion/src/datahub/integrations/assertion/snowflake/metric_sql_generator.py index 5b079129e0a9c5..facc7d107d1ba7 100644 --- 
a/metadata-ingestion/src/datahub/integrations/assertion/snowflake/metric_sql_generator.py +++ b/metadata-ingestion/src/datahub/integrations/assertion/snowflake/metric_sql_generator.py @@ -72,7 +72,6 @@ def _(self, assertion: FixedIntervalFreshnessAssertion) -> str: @metric_sql.register def _(self, assertion: RowCountTotalVolumeAssertion) -> str: - # Can not use information schema here due to error - # Data metric function body cannot refer to the non-deterministic function 'CURRENT_DATABASE_MAIN_METASTORE_ID'. diff --git a/metadata-ingestion/src/datahub/specific/dashboard.py b/metadata-ingestion/src/datahub/specific/dashboard.py index 8228dbc011db2f..f57df15914369c 100644 --- a/metadata-ingestion/src/datahub/specific/dashboard.py +++ b/metadata-ingestion/src/datahub/specific/dashboard.py @@ -433,7 +433,6 @@ def set_description(self, description: str) -> "DashboardPatchBuilder": def add_custom_properties( self, custom_properties: Optional[Dict[str, str]] = None ) -> "DashboardPatchBuilder": - if custom_properties: for key, value in custom_properties.items(): self.custom_properties_patch_helper.add_property(key, value) diff --git a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py index 52934f9f72a70e..eb11335624850b 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py @@ -761,7 +761,6 @@ def add_preparsed_query( session_has_temp_tables: bool = True, _is_internal: bool = False, ) -> None: - # Adding tool specific metadata extraction here allows it # to work for both ObservedQuery and PreparsedQuery as # add_preparsed_query it used within add_observed_query. diff --git a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py index 0806d0ec774fe7..61803125c66a58 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py @@ -862,7 +862,6 @@ def _sqlglot_lineage_inner( default_schema: Optional[str] = None, default_dialect: Optional[str] = None, ) -> SqlParsingResult: - if not default_dialect: dialect = get_dialect(schema_resolver.platform) else: diff --git a/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py b/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py index cdd35c23e30885..0d85002776e5e2 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py +++ b/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py @@ -79,7 +79,6 @@ def _extract_mode_query(self, entry: QueryLog) -> bool: return True def extract_bi_metadata(self, entry: QueryLog) -> bool: - for tool, meta_extractor in self.known_tool_extractors: try: if meta_extractor(entry): diff --git a/metadata-ingestion/src/datahub/testing/mcp_diff.py b/metadata-ingestion/src/datahub/testing/mcp_diff.py index 95b8e83c7a64a5..5e669a718e9ad3 100644 --- a/metadata-ingestion/src/datahub/testing/mcp_diff.py +++ b/metadata-ingestion/src/datahub/testing/mcp_diff.py @@ -206,7 +206,7 @@ def apply_delta(self, golden: List[Dict[str, Any]]) -> None: """ aspect_diffs = [v for d in self.aspect_changes.values() for v in d.values()] for aspect_diff in aspect_diffs: - for (_, old, new) in aspect_diff.aspects_changed.keys(): + for _, old, new in aspect_diff.aspects_changed.keys(): golden[old.delta_info.idx] = new.delta_info.original indices_to_remove = set() 
diff --git a/metadata-ingestion/src/datahub/utilities/mapping.py b/metadata-ingestion/src/datahub/utilities/mapping.py index 4ea42d568da635..17023c7b388e76 100644 --- a/metadata-ingestion/src/datahub/utilities/mapping.py +++ b/metadata-ingestion/src/datahub/utilities/mapping.py @@ -43,7 +43,6 @@ def _make_owner_category_list( owner_category_urn: Optional[str], owner_ids: List[str], ) -> List[Dict]: - return [ { "urn": mce_builder.make_owner_urn(owner_id, owner_type), @@ -285,7 +284,6 @@ def convert_to_aspects(self, operation_map: Dict[str, list]) -> Dict[str, Any]: aspect_map[Constants.ADD_TAG_OPERATION] = tag_aspect if Constants.ADD_OWNER_OPERATION in operation_map: - owner_aspect = OwnershipClass( owners=[ OwnerClass( diff --git a/metadata-ingestion/src/datahub/utilities/threaded_iterator_executor.py b/metadata-ingestion/src/datahub/utilities/threaded_iterator_executor.py index 216fa155035d3e..04dee0df422372 100644 --- a/metadata-ingestion/src/datahub/utilities/threaded_iterator_executor.py +++ b/metadata-ingestion/src/datahub/utilities/threaded_iterator_executor.py @@ -19,7 +19,6 @@ def process( args_list: Iterable[Tuple[Any, ...]], max_workers: int, ) -> Generator[T, None, None]: - out_q: queue.Queue[T] = queue.Queue() def _worker_wrapper( diff --git a/metadata-ingestion/tests/integration/azure_ad/test_azure_ad.py b/metadata-ingestion/tests/integration/azure_ad/test_azure_ad.py index 7005bc2e4411bf..024bb62bbe9ce9 100644 --- a/metadata-ingestion/tests/integration/azure_ad/test_azure_ad.py +++ b/metadata-ingestion/tests/integration/azure_ad/test_azure_ad.py @@ -68,7 +68,6 @@ def run_ingest( "datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider.DataHubGraph", mock_datahub_graph, ) as mock_checkpoint: - mock_checkpoint.return_value = mock_datahub_graph mocked_functions_reference( diff --git a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery_queries.py b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery_queries.py index 9290100b0c521c..09c3d28699e5ee 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery_queries.py +++ b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery_queries.py @@ -44,7 +44,6 @@ def _generate_queries_cached_file(tmp_path: Path, queries_json_path: Path) -> No @patch("google.cloud.bigquery.Client") @patch("google.cloud.resourcemanager_v3.ProjectsClient") def test_queries_ingestion(project_client, client, pytestconfig, monkeypatch, tmp_path): - test_resources_dir = pytestconfig.rootpath / "tests/integration/bigquery_v2" mcp_golden_path = f"{test_resources_dir}/bigquery_queries_mcps_golden.json" mcp_output_path = tmp_path / "bigquery_queries_mcps.json" diff --git a/metadata-ingestion/tests/integration/looker/test_looker.py b/metadata-ingestion/tests/integration/looker/test_looker.py index 7238a49cb37d2f..8bbf14709ff9fb 100644 --- a/metadata-ingestion/tests/integration/looker/test_looker.py +++ b/metadata-ingestion/tests/integration/looker/test_looker.py @@ -1047,7 +1047,6 @@ def test_independent_soft_deleted_looks( mocked_client = mock.MagicMock() with mock.patch("looker_sdk.init40") as mock_sdk: - mock_sdk.return_value = mocked_client setup_mock_look(mocked_client) setup_mock_soft_deleted_look(mocked_client) diff --git a/metadata-ingestion/tests/integration/lookml/test_lookml.py b/metadata-ingestion/tests/integration/lookml/test_lookml.py index e4eb564e3e86b7..9a7ed4e0f5df16 100644 --- a/metadata-ingestion/tests/integration/lookml/test_lookml.py +++ 
b/metadata-ingestion/tests/integration/lookml/test_lookml.py @@ -881,7 +881,6 @@ def test_manifest_parser(pytestconfig: pytest.Config) -> None: @freeze_time(FROZEN_TIME) def test_duplicate_field_ingest(pytestconfig, tmp_path, mock_time): - test_resources_dir = pytestconfig.rootpath / "tests/integration/lookml" mce_out_file = "duplicate_ingest_mces_output.json" diff --git a/metadata-ingestion/tests/integration/okta/test_okta.py b/metadata-ingestion/tests/integration/okta/test_okta.py index 63ef8793cadddc..10148273c93666 100644 --- a/metadata-ingestion/tests/integration/okta/test_okta.py +++ b/metadata-ingestion/tests/integration/okta/test_okta.py @@ -58,14 +58,12 @@ def run_ingest( mocked_functions_reference, recipe, ): - with patch( "datahub.ingestion.source.identity.okta.OktaClient" ) as MockClient, patch( "datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider.DataHubGraph", mock_datahub_graph, ) as mock_checkpoint: - mock_checkpoint.return_value = mock_datahub_graph mocked_functions_reference(MockClient=MockClient) @@ -277,7 +275,6 @@ def overwrite_group_in_mocked_data(test_resources_dir, MockClient): def _init_mock_okta_client( test_resources_dir, MockClient, mock_users_json=None, mock_groups_json=None ): - okta_users_json_file = ( test_resources_dir / "okta_users.json" if mock_users_json is None diff --git a/metadata-ingestion/tests/integration/oracle/common.py b/metadata-ingestion/tests/integration/oracle/common.py index 79dbda8c30f896..9e2cc42ef10256 100644 --- a/metadata-ingestion/tests/integration/oracle/common.py +++ b/metadata-ingestion/tests/integration/oracle/common.py @@ -33,7 +33,6 @@ def scalar(self): @dataclass class MockConstraints: - constraint_name: str = "mock constraint name" constraint_type: str = "P" local_column: str = "mock column name" diff --git a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py index 23b23ecada0d49..1011dcba750271 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py +++ b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py @@ -899,7 +899,6 @@ def test_scan_all_workspaces( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api(request_mock=requests_mock) @@ -949,7 +948,6 @@ def test_extract_reports( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" @@ -1641,7 +1639,6 @@ def test_independent_datasets_extraction( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api( @@ -1737,7 +1734,6 @@ def test_cll_extraction( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api( @@ -1793,7 +1789,6 @@ def test_cll_extraction_flags( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - register_mock_api( request_mock=requests_mock, ) @@ -1804,7 +1799,6 @@ def test_cll_extraction_flags( ) with pytest.raises(Exception, match=pattern): - Pipeline.create( { "run_id": "powerbi-test", diff --git a/metadata-ingestion/tests/integration/qlik_sense/test_qlik_sense.py b/metadata-ingestion/tests/integration/qlik_sense/test_qlik_sense.py index ee1aafb6cf32dc..95f096cc3def35 100644 --- 
a/metadata-ingestion/tests/integration/qlik_sense/test_qlik_sense.py +++ b/metadata-ingestion/tests/integration/qlik_sense/test_qlik_sense.py @@ -1011,7 +1011,6 @@ def default_config(): def test_qlik_sense_ingest( pytestconfig, tmp_path, requests_mock, mock_websocket_send_request ): - test_resources_dir = pytestconfig.rootpath / "tests/integration/qlik_sense" register_mock_api(request_mock=requests_mock) @@ -1051,7 +1050,6 @@ def test_qlik_sense_ingest( def test_platform_instance_ingest( pytestconfig, tmp_path, requests_mock, mock_websocket_send_request ): - test_resources_dir = pytestconfig.rootpath / "tests/integration/qlik_sense" register_mock_api(request_mock=requests_mock) diff --git a/metadata-ingestion/tests/integration/sigma/test_sigma.py b/metadata-ingestion/tests/integration/sigma/test_sigma.py index 6c01bf6dc80fe7..19fa1448fee598 100644 --- a/metadata-ingestion/tests/integration/sigma/test_sigma.py +++ b/metadata-ingestion/tests/integration/sigma/test_sigma.py @@ -420,7 +420,6 @@ def register_mock_api(request_mock: Any, override_data: dict = {}) -> None: @pytest.mark.integration def test_sigma_ingest(pytestconfig, tmp_path, requests_mock): - test_resources_dir = pytestconfig.rootpath / "tests/integration/sigma" register_mock_api(request_mock=requests_mock) @@ -464,7 +463,6 @@ def test_sigma_ingest(pytestconfig, tmp_path, requests_mock): @pytest.mark.integration def test_platform_instance_ingest(pytestconfig, tmp_path, requests_mock): - test_resources_dir = pytestconfig.rootpath / "tests/integration/sigma" register_mock_api(request_mock=requests_mock) @@ -510,7 +508,6 @@ def test_platform_instance_ingest(pytestconfig, tmp_path, requests_mock): @pytest.mark.integration def test_sigma_ingest_shared_entities(pytestconfig, tmp_path, requests_mock): - test_resources_dir = pytestconfig.rootpath / "tests/integration/sigma" override_data = { diff --git a/metadata-ingestion/tests/integration/snowflake/common.py b/metadata-ingestion/tests/integration/snowflake/common.py index 8f45be96625a45..9e4bb2f0eb634f 100644 --- a/metadata-ingestion/tests/integration/snowflake/common.py +++ b/metadata-ingestion/tests/integration/snowflake/common.py @@ -441,7 +441,6 @@ def default_query_results( # noqa: C901 include_column_lineage=True, ), ): - return [ { "DOWNSTREAM_TABLE_NAME": f"TEST_DB.TEST_SCHEMA.TABLE_{op_idx}", diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index 5a5552a78c56fa..79d1d45b60a503 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -237,7 +237,6 @@ def mock_sdk_client( datasources_side_effect: List[dict], sign_out_side_effect: List[dict], ) -> mock.MagicMock: - mock_client = mock.Mock() mocked_metadata = mock.Mock() mocked_metadata.query.side_effect = side_effect_query_metadata_response @@ -1152,7 +1151,6 @@ def test_site_name_pattern(pytestconfig, tmp_path, mock_datahub_graph): @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_permission_mode_switched_error(pytestconfig, tmp_path, mock_datahub_graph): - with mock.patch( "datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider.DataHubGraph", mock_datahub_graph, diff --git a/metadata-ingestion/tests/integration/unity/test_unity_catalog_ingest.py b/metadata-ingestion/tests/integration/unity/test_unity_catalog_ingest.py index c078f1b77fd1be..b8b0563a1d24e5 100644 --- 
a/metadata-ingestion/tests/integration/unity/test_unity_catalog_ingest.py +++ b/metadata-ingestion/tests/integration/unity/test_unity_catalog_ingest.py @@ -282,7 +282,6 @@ def register_mock_data(workspace_client): def mock_hive_sql(query): - if query == "DESCRIBE EXTENDED `bronze_kambi`.`bet` betStatusId": return [ ("col_name", "betStatusId"), diff --git a/metadata-ingestion/tests/performance/snowflake/test_snowflake.py b/metadata-ingestion/tests/performance/snowflake/test_snowflake.py index 73b7790b62e9e7..5042c78c2e7b91 100644 --- a/metadata-ingestion/tests/performance/snowflake/test_snowflake.py +++ b/metadata-ingestion/tests/performance/snowflake/test_snowflake.py @@ -16,7 +16,6 @@ def run_test(): - with mock.patch("snowflake.connector.connect") as mock_connect: sf_connection = mock.MagicMock() sf_cursor = mock.MagicMock() diff --git a/metadata-ingestion/tests/unit/api/entities/common/test_serialized_value.py b/metadata-ingestion/tests/unit/api/entities/common/test_serialized_value.py index c9f16bbcef6fc4..a72087376b78a3 100644 --- a/metadata-ingestion/tests/unit/api/entities/common/test_serialized_value.py +++ b/metadata-ingestion/tests/unit/api/entities/common/test_serialized_value.py @@ -10,7 +10,6 @@ class MyTestModel(BaseModel): def test_base_model(): - test_base_model = MyTestModel( test_string_field="test_string_field", test_int_field=42, @@ -31,7 +30,6 @@ def test_base_model(): def test_dictwrapper(): - from datahub.metadata.schema_classes import DatasetPropertiesClass dataset_properties = DatasetPropertiesClass( @@ -58,7 +56,6 @@ def test_dictwrapper(): def test_raw_dictionary(): - test_object = { "test_string_field": "test_string_field", "test_int_field": 42, diff --git a/metadata-ingestion/tests/unit/api/source_helpers/test_incremental_lineage_helper.py b/metadata-ingestion/tests/unit/api/source_helpers/test_incremental_lineage_helper.py index cafca521ae0148..c5c4a378894c32 100644 --- a/metadata-ingestion/tests/unit/api/source_helpers/test_incremental_lineage_helper.py +++ b/metadata-ingestion/tests/unit/api/source_helpers/test_incremental_lineage_helper.py @@ -104,7 +104,6 @@ def test_incremental_table_lineage(tmp_path, pytestconfig): def test_incremental_table_lineage_empty_upstreams(tmp_path, pytestconfig): - urn = make_dataset_urn(platform, "dataset1") aspect = make_lineage_aspect( "dataset1", diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py b/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py index 0d21936a74d072..6261d2b70905b7 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py @@ -616,7 +616,6 @@ def test_table_lineage_via_temp_table_disordered_add( @freeze_time(FROZEN_TIME) def test_basic_usage(pytestconfig: pytest.Config) -> None: - frozen_timestamp = parse_user_datetime(FROZEN_TIME) aggregator = SqlParsingAggregator( platform="redshift", diff --git a/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py b/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py index 63de742b201a97..3247a64631da76 100644 --- a/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py +++ b/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py @@ -184,7 +184,6 @@ def test_bigquery_table_sanitasitation(): def test_unquote_and_decode_unicode_escape_seq(): - # Test with a string that starts and ends with quotes and has Unicode escape sequences input_string = '"Hello \\u003cWorld\\u003e"' expected_output = "Hello " diff 
--git a/metadata-ingestion/tests/unit/test_neo4j_source.py b/metadata-ingestion/tests/unit/test_neo4j_source.py index f410cbcfff3ff2..07f41a37aa36dd 100644 --- a/metadata-ingestion/tests/unit/test_neo4j_source.py +++ b/metadata-ingestion/tests/unit/test_neo4j_source.py @@ -99,7 +99,6 @@ def create_df(self): df = pd.DataFrame(data) return df - def test_get_obj_type(self): assert self.neo.get_obj_type(self.record_1) == "node" assert self.neo.get_obj_type(self.record_2) == "node" diff --git a/metadata-ingestion/tests/unit/test_redshift_lineage.py b/metadata-ingestion/tests/unit/test_redshift_lineage.py index 78b7169a93f3c8..457b56ae1cef44 100644 --- a/metadata-ingestion/tests/unit/test_redshift_lineage.py +++ b/metadata-ingestion/tests/unit/test_redshift_lineage.py @@ -221,7 +221,6 @@ def mock_redshift_connection() -> MagicMock: def mock_graph() -> DataHubGraph: - graph = MagicMock() graph._make_schema_resolver.return_value = SchemaResolver( From cbcfa2faeca5054f5ab993fe20f20a9352f94c03 Mon Sep 17 00:00:00 2001 From: kbartlett Date: Thu, 10 Oct 2024 16:51:53 -0400 Subject: [PATCH 04/31] feat: connector for Neo4j --- metadata-ingestion/docs/sources/neo4j/neo4j.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/docs/sources/neo4j/neo4j.md b/metadata-ingestion/docs/sources/neo4j/neo4j.md index a0b5bdfa4693aa..1e37edb5dc3425 100644 --- a/metadata-ingestion/docs/sources/neo4j/neo4j.md +++ b/metadata-ingestion/docs/sources/neo4j/neo4j.md @@ -84,7 +84,7 @@ sink: ### Sample data that is returned from Neo4j. This is the data that is parsed and used to create Nodes, Relationships. -Details can be found here: https://neo4j.com/labs/apoc/4.4/overview/apoc.meta/apoc.meta.schema/ + Example relationship: { From bc4cfcf84a0689879d5095caa37b5a3b7fdd90c3 Mon Sep 17 00:00:00 2001 From: kbartlett Date: Fri, 11 Oct 2024 10:58:03 -0400 Subject: [PATCH 05/31] feat: connector for Neo4j --- metadata-ingestion/docs/sources/neo4j/neo4j.md | 5 ----- 1 file changed, 5 deletions(-) diff --git a/metadata-ingestion/docs/sources/neo4j/neo4j.md b/metadata-ingestion/docs/sources/neo4j/neo4j.md index 1e37edb5dc3425..3af72aea939df2 100644 --- a/metadata-ingestion/docs/sources/neo4j/neo4j.md +++ b/metadata-ingestion/docs/sources/neo4j/neo4j.md @@ -35,10 +35,6 @@ Run the following commands to install the relevant plugin(s): Use the following recipe(s) to get started with ingestion. -_For general pointers on writing and running a recipe, see our [main recipe guide](../README.md#recipes)._ - - -
View All Recipe Configuartion Options @@ -61,7 +57,6 @@ _For general pointers on writing and running a recipe, see our [main recipe guid
-#### `'acryl-datahub[neo4j]'` ```yml source: From bff830a4212bd11b7d5b295a7e289512fcf314c5 Mon Sep 17 00:00:00 2001 From: kbartlett Date: Fri, 18 Oct 2024 14:11:55 -0400 Subject: [PATCH 06/31] feat: connector for Neo4j --- metadata-ingestion/docs/sources/neo4j/neo4j.md | 15 +++------------ .../docs/sources/neo4j/neo4j_recipe.yml | 1 - .../ingestion/source/neo4j/neo4j_source.py | 9 ++++----- 3 files changed, 7 insertions(+), 18 deletions(-) diff --git a/metadata-ingestion/docs/sources/neo4j/neo4j.md b/metadata-ingestion/docs/sources/neo4j/neo4j.md index 3af72aea939df2..7bd3cf470e330a 100644 --- a/metadata-ingestion/docs/sources/neo4j/neo4j.md +++ b/metadata-ingestion/docs/sources/neo4j/neo4j.md @@ -1,15 +1,10 @@ -# Source Name - - -![Certified](https://img.shields.io/badge/support%20status-certified-brightgreen) -![Incubating](https://img.shields.io/badge/support%20status-incubating-blue) -![Testing](https://img.shields.io/badge/support%20status-testing-lightgrey) - ## Integration Details -Neo4j metadata will be ingested into DataHub using Call apoc.meta.data(); The data that is returned will be parsed +Neo4j metadata will be ingested into DataHub using +`CALL apoc.meta.schema() YIELD value UNWIND keys(value) AS key RETURN key, value[key] AS value;` +The data that is returned will be parsed and will be displayed as Nodes and Relationships in DataHub. Each object will be tagged with describing what kind of DataHub object it is. The defaults are 'Node' and 'Relationship'. These tag values can be overwritten in the recipe. @@ -28,10 +23,6 @@ In order to ingest metadata from Neo4j, you will need: Run the following commands to install the relevant plugin(s): -`pip install 'acryl-datahub[neo4j]'` - - -### Configure the Ingestion Recipe(s) Use the following recipe(s) to get started with ingestion. 
diff --git a/metadata-ingestion/docs/sources/neo4j/neo4j_recipe.yml b/metadata-ingestion/docs/sources/neo4j/neo4j_recipe.yml index af5985b1575e2c..61778ef3decef6 100644 --- a/metadata-ingestion/docs/sources/neo4j/neo4j_recipe.yml +++ b/metadata-ingestion/docs/sources/neo4j/neo4j_recipe.yml @@ -4,7 +4,6 @@ source: uri: 'neo4j+ssc://host:7687' username: 'neo4j' password: 'password' - gms_server: 'http://localhost:8080' node_tag: 'Node' relationship_tag: 'Relationship' environment: 'PROD' diff --git a/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py b/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py index de4790d756dcdb..9cf0c89a098c48 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py @@ -7,7 +7,7 @@ from neo4j import GraphDatabase from pydantic.fields import Field -from datahub.configuration.common import ConfigModel +from datahub.configuration.source_common import EnvConfigMixin from datahub.emitter.mce_builder import ( make_data_platform_urn, make_dataset_urn, @@ -56,11 +56,10 @@ } -class Neo4jConfig(ConfigModel): +class Neo4jConfig(EnvConfigMixin): username: str = Field(default=None, description="Neo4j Username") password: str = Field(default=None, description="Neo4j Password") uri: str = Field(default=None, description="The URI for the Neo4j server") - gms_server: str = Field(default=None, description="Address for the gms server") environment: str = Field(default=None, description="Neo4j env") node_tag: str = Field( default="Node", @@ -79,7 +78,7 @@ class Neo4jSourceReport(SourceReport): obj_created: int = 0 -@platform_name("Neo4j",id="neo4j") +@platform_name("Neo4j", id="neo4j") @config_class(Neo4jConfig) @support_status(SupportStatus.CERTIFIED) class Neo4jSource(Source): @@ -168,7 +167,7 @@ def generate_neo4j_object( def add_tag_to_dataset( self, table_name: str, tag_name: str ) -> MetadataChangeProposalWrapper: - graph = DataHubGraph(DatahubClientConfig(server=self.config.gms_server)) + graph = DataHubGraph(DatahubClientConfig()) dataset_urn = make_dataset_urn( platform=self.config.platform, name=table_name, env=self.config.environment ) From 5ac8fd74be7248969293867d5af0cb5b060e62f7 Mon Sep 17 00:00:00 2001 From: kbartlett Date: Mon, 21 Oct 2024 11:49:54 -0400 Subject: [PATCH 07/31] feat: connector for Neo4j --- metadata-ingestion/setup.py | 3 +++ .../src/datahub/ingestion/source/neo4j/neo4j_source.py | 5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 365da21208ecce..d688a5e01bd884 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -491,6 +491,7 @@ "qlik-sense": sqlglot_lib | {"requests", "websocket-client"}, "sigma": sqlglot_lib | {"requests"}, "sac": sac, + "neo4j": {"pandas", "neo4j"}, } # This is mainly used to exclude plugins from the Docker image. 
@@ -633,6 +634,7 @@ "qlik-sense", "sigma", "sac", + "neo4j" ] if plugin for dependency in plugins[plugin] @@ -750,6 +752,7 @@ "qlik-sense = datahub.ingestion.source.qlik_sense.qlik_sense:QlikSenseSource", "sigma = datahub.ingestion.source.sigma.sigma:SigmaSource", "sac = datahub.ingestion.source.sac.sac:SACSource", + "neo4j = datahub.ingestion.source.neo4j.neo4j_source:Neo4jSource", ], "datahub.ingestion.transformer.plugins": [ "pattern_cleanup_ownership = datahub.ingestion.transformer.pattern_cleanup_ownership:PatternCleanUpOwnership", diff --git a/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py b/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py index 9cf0c89a098c48..2060007b2c1940 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py @@ -167,7 +167,9 @@ def generate_neo4j_object( def add_tag_to_dataset( self, table_name: str, tag_name: str ) -> MetadataChangeProposalWrapper: - graph = DataHubGraph(DatahubClientConfig()) + graph = DataHubGraph( + DatahubClientConfig(server=self.ctx.pipeline_config.sink.config["server"]) + ) dataset_urn = make_dataset_urn( platform=self.config.platform, name=table_name, env=self.config.environment ) @@ -316,7 +318,6 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: yield MetadataWorkUnit( id=row["key"], mcp_raw=self.generate_neo4j_object( - # mcp=self.generate_neo4j_object( columns=row["property_data_types"], dataset=row["key"], platform=self.config.platform, From 9360ce69df94ab2bf200412dec05e0e70b3a295d Mon Sep 17 00:00:00 2001 From: kbartlett Date: Mon, 21 Oct 2024 11:56:42 -0400 Subject: [PATCH 08/31] feat: connector for Neo4j --- .../platformresource/platform_resource.py | 194 ++++++++++-------- 1 file changed, 110 insertions(+), 84 deletions(-) diff --git a/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py b/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py index 9346c9b002a4ac..0f7b10a067053a 100644 --- a/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py +++ b/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py @@ -1,5 +1,5 @@ import logging -from typing import Dict, Iterable, List, Optional, Union +from typing import Callable, Dict, Iterable, List, Optional, Tuple, Type, Union, cast from avrogen.dict_wrapper import DictWrapper from pydantic import BaseModel @@ -14,7 +14,14 @@ from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_builder import DatahubKey from datahub.ingestion.graph.client import DataHubGraph -from datahub.metadata.urns import PlatformResourceUrn +from datahub.metadata.urns import DataPlatformUrn, PlatformResourceUrn, Urn +from datahub.utilities.openapi_utils import OpenAPIGraphClient +from datahub.utilities.search_utils import ( + ElasticDocumentQuery, + ElasticsearchQueryBuilder, + LogicalOperator, + SearchField, +) logger = logging.getLogger(__name__) @@ -69,70 +76,75 @@ def to_resource_info(self) -> models.PlatformResourceInfoClass: ) -class OpenAPIGraphClient: - ENTITY_KEY_ASPECT_MAP = { - aspect_type.ASPECT_INFO.get("keyForEntity"): name - for name, aspect_type in models.ASPECT_NAME_MAP.items() - if aspect_type.ASPECT_INFO.get("keyForEntity") - } +class DataPlatformInstanceUrn: + """ + A simple implementation of a URN class for DataPlatformInstance. 
+ Since this is not present in the URN registry, we need to implement it here. + """ - def __init__(self, graph: DataHubGraph): - self.graph = graph - self.openapi_base = graph._gms_server.rstrip("/") + "/openapi/v3" + @staticmethod + def create_from_id(platform_instance_urn: str) -> Urn: + if platform_instance_urn.startswith("urn:li:platformInstance:"): + string_urn = platform_instance_urn + else: + string_urn = f"urn:li:platformInstance:{platform_instance_urn}" + return Urn.from_string(string_urn) - def scroll_urns_by_filter( - self, - entity_type: str, - extra_or_filters: List[Dict[str, str]], - extra_and_filters: List[Dict[str, str]] = [], - ) -> Iterable[str]: - """ - Scroll through all urns that match the given filters - """ - key_aspect = self.ENTITY_KEY_ASPECT_MAP.get(entity_type) - assert key_aspect, f"No key aspect found for entity type {entity_type}" - if extra_or_filters and extra_and_filters: - raise ValueError( - "Only one of extra_or_filters and extra_and_filters should be provided" - ) +class UrnSearchField(SearchField): + """ + A search field that supports URN values. + TODO: Move this to search_utils after we make this more generic. + """ - count = 1000 - query = ( - " OR ".join( - [ - f"{filter['field']}:\"{filter['value']}\"" - for filter in extra_or_filters - ] - ) - if extra_or_filters - else " AND ".join( - [ - f"{filter['field']}:\"{filter['value']}\"" - for filter in extra_and_filters - ] - ) + def __init__(self, field_name: str, urn_value_extractor: Callable[[str], Urn]): + self.urn_value_extractor = urn_value_extractor + super().__init__(field_name) + + def get_search_value(self, value: str) -> str: + return str(self.urn_value_extractor(value)) + + +class PlatformResourceSearchField(SearchField): + def __init__(self, field_name: str): + super().__init__(field_name) + + @classmethod + def from_search_field( + cls, search_field: SearchField + ) -> "PlatformResourceSearchField": + # pretends to be a class method, but just returns the input + return search_field # type: ignore + + +class PlatformResourceSearchFields: + PRIMARY_KEY = PlatformResourceSearchField("primaryKey") + RESOURCE_TYPE = PlatformResourceSearchField("resourceType") + SECONDARY_KEYS = PlatformResourceSearchField("secondaryKeys") + PLATFORM = PlatformResourceSearchField.from_search_field( + UrnSearchField( + field_name="platform.keyword", + urn_value_extractor=DataPlatformUrn.create_from_id, ) - scroll_id = None - while True: - response = self.graph._get_generic( - self.openapi_base + f"/entity/{entity_type.lower()}", - params={ - "systemMetadata": "false", - "includeSoftDelete": "false", - "skipCache": "false", - "aspects": [key_aspect], - "scrollId": scroll_id, - "count": count, - "query": query, - }, - ) - entities = response.get("entities", []) - scroll_id = response.get("scrollId") - for entity in entities: - yield entity["urn"] - if not scroll_id: - break + ) + PLATFORM_INSTANCE = PlatformResourceSearchField.from_search_field( + UrnSearchField( + field_name="platformInstance.keyword", + urn_value_extractor=DataPlatformInstanceUrn.create_from_id, + ) + ) + + +class ElasticPlatformResourceQuery(ElasticDocumentQuery[PlatformResourceSearchField]): + def __init__(self): + super().__init__() + + @classmethod + def create_from( + cls: Type["ElasticPlatformResourceQuery"], + *args: Tuple[Union[str, PlatformResourceSearchField], str], + ) -> "ElasticPlatformResourceQuery": + return cast(ElasticPlatformResourceQuery, super().create_from(*args)) class PlatformResource(BaseModel): @@ -146,6 +158,12 @@ def 
remove( cls, key: PlatformResourceKey, ) -> "PlatformResource": + """ + Creates a PlatformResource object with the removed status set to True. + Removed PlatformResource objects are used to soft-delete resources from + the graph. + To hard-delete a resource, use the delete method. + """ return cls( id=key.id, removed=True, @@ -239,28 +257,38 @@ def from_datahub( @staticmethod def search_by_key( - graph_client: DataHubGraph, key: str, primary: bool = True + graph_client: DataHubGraph, + key: str, + primary: bool = True, + is_exact: bool = True, ) -> Iterable["PlatformResource"]: - extra_or_filters = [] - extra_or_filters.append( - { - "field": "primaryKey", - "condition": "EQUAL", - "value": key, - } + """ + Searches for PlatformResource entities by primary or secondary key. + + :param graph_client: DataHubGraph client + :param key: The key to search for + :param primary: Whether to search for primary only or expand the search + to secondary keys (default: True) + :param is_exact: Whether to search for an exact match (default: True) + :return: An iterable of PlatformResource objects + """ + + elastic_platform_resource_group = ( + ElasticPlatformResourceQuery.create_from() + .group(LogicalOperator.OR) + .add_field_match( + PlatformResourceSearchFields.PRIMARY_KEY, key, is_exact=is_exact + ) ) if not primary: # we expand the search to secondary keys - extra_or_filters.append( - { - "field": "secondaryKeys", - "condition": "EQUAL", - "value": key, - } + elastic_platform_resource_group.add_field_match( + PlatformResourceSearchFields.SECONDARY_KEYS, key, is_exact=is_exact ) + query = elastic_platform_resource_group.end() openapi_client = OpenAPIGraphClient(graph_client) for urn in openapi_client.scroll_urns_by_filter( entity_type="platformResource", - extra_or_filters=extra_or_filters, + query=query, ): platform_resource = PlatformResource.from_datahub(graph_client, urn) if platform_resource: @@ -272,18 +300,16 @@ def delete(self, graph_client: DataHubGraph, hard: bool = True) -> None: @staticmethod def search_by_filters( graph_client: DataHubGraph, - and_filters: List[Dict[str, str]] = [], - or_filters: List[Dict[str, str]] = [], + query: Union[ + ElasticPlatformResourceQuery, + ElasticDocumentQuery, + ElasticsearchQueryBuilder, + ], ) -> Iterable["PlatformResource"]: - if and_filters and or_filters: - raise ValueError( - "Only one of and_filters and or_filters should be provided" - ) openapi_client = OpenAPIGraphClient(graph_client) for urn in openapi_client.scroll_urns_by_filter( entity_type="platformResource", - extra_or_filters=or_filters if or_filters else [], - extra_and_filters=and_filters if and_filters else [], + query=query, ): platform_resource = PlatformResource.from_datahub(graph_client, urn) if platform_resource: From 3153e6aae30577b3454036b4a6b5beaf50871e1f Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Fri, 18 Oct 2024 12:01:39 -0700 Subject: [PATCH 09/31] feat(ingest/transformer/domain): Add support for on conflict do nothing to dataset domain transformers (#11649) --- .../docs/transformer/dataset_transformer.md | 28 +++---- .../src/datahub/ingestion/graph/client.py | 1 + .../ingestion/transformer/dataset_domain.py | 41 ++++++---- .../tests/unit/test_transform_dataset.py | 76 +++++++++++++++++++ 4 files changed, 119 insertions(+), 27 deletions(-) diff --git a/metadata-ingestion/docs/transformer/dataset_transformer.md b/metadata-ingestion/docs/transformer/dataset_transformer.md index d48c6d2c1ab5b4..66274ce64a8d29 100644 --- 
a/metadata-ingestion/docs/transformer/dataset_transformer.md +++ b/metadata-ingestion/docs/transformer/dataset_transformer.md @@ -122,12 +122,13 @@ transformers: ``` ## Simple Add Dataset ownership ### Config Details -| Field | Required | Type | Default | Description | -|--------------------|----------|--------------|-------------|---------------------------------------------------------------------| -| `owner_urns` | ✅ | list[string] | | List of owner urns. | -| `ownership_type` | | string | "DATAOWNER" | ownership type of the owners (either as enum or ownership type urn) | -| `replace_existing` | | boolean | `false` | Whether to remove ownership from entity sent by ingestion source. | -| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | +| Field | Required | Type | Default | Description | +|--------------------|----------|--------------|-------------|------------------------------------------------------------------------------------------------------------| +| `owner_urns` | ✅ | list[string] | | List of owner urns. | +| `ownership_type` | | string | "DATAOWNER" | ownership type of the owners (either as enum or ownership type urn) | +| `replace_existing` | | boolean | `false` | Whether to remove ownership from entity sent by ingestion source. | +| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | +| `on_conflict` | | enum | `DO_UPDATE` | Whether to make changes if domains already exist. If set to DO_NOTHING, `semantics` setting is irrelevant. | For transformer behaviour on `replace_existing` and `semantics`, please refer section [Relationship Between replace_existing And semantics](#relationship-between-replace_existing-and-semantics). @@ -191,13 +192,14 @@ transformers: ## Pattern Add Dataset ownership ### Config Details -| Field | Required | Type | Default | Description | -|--------------------|----------|----------------------|-------------|-----------------------------------------------------------------------------------------| -| `owner_pattern` | ✅ | map[regx, list[urn]] | | entity urn with regular expression and list of owners urn apply to matching entity urn. | -| `ownership_type` | | string | "DATAOWNER" | ownership type of the owners (either as enum or ownership type urn) | -| `replace_existing` | | boolean | `false` | Whether to remove owners from entity sent by ingestion source. | -| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | -| `is_container` | | bool | `false` | Whether to also consider a container or not. If true, then ownership will be attached to both the dataset and its container. | +| Field | Required | Type | Default | Description | +|--------------------|----------|----------------------|-------------|------------------------------------------------------------------------------------------------------------------------------| +| `owner_pattern` | ✅ | map[regx, list[urn]] | | entity urn with regular expression and list of owners urn apply to matching entity urn. | +| `ownership_type` | | string | "DATAOWNER" | ownership type of the owners (either as enum or ownership type urn) | +| `replace_existing` | | boolean | `false` | Whether to remove owners from entity sent by ingestion source. | +| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | +| `is_container` | | bool | `false` | Whether to also consider a container or not. 
If true, then ownership will be attached to both the dataset and its container. | +| `on_conflict` | | enum | `DO_UPDATE` | Whether to make changes if domains already exist. If set to DO_NOTHING, `semantics` setting is irrelevant. | let’s suppose we’d like to append a series of users who we know to own a different dataset from a data source but aren't detected during normal ingestion. To do so, we can use the `pattern_add_dataset_ownership` module that’s included in the ingestion framework. This will match the pattern to `urn` of the dataset and assign the respective owners. diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index e8fae6254ae885..1d2528a24c4e57 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -351,6 +351,7 @@ def get_tags(self, entity_urn: str) -> Optional[GlobalTagsClass]: def get_glossary_terms(self, entity_urn: str) -> Optional[GlossaryTermsClass]: return self.get_aspect(entity_urn=entity_urn, aspect_type=GlossaryTermsClass) + @functools.lru_cache(maxsize=1) def get_domain(self, entity_urn: str) -> Optional[DomainsClass]: return self.get_aspect(entity_urn=entity_urn, aspect_type=DomainsClass) diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/dataset_domain.py b/metadata-ingestion/src/datahub/ingestion/transformer/dataset_domain.py index 6a838248152650..6b78b71eaa78e9 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/dataset_domain.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/dataset_domain.py @@ -1,6 +1,8 @@ import logging +from enum import auto from typing import Callable, Dict, List, Optional, Sequence, Union, cast +from datahub.configuration._config_enum import ConfigEnum from datahub.configuration.common import ( ConfigurationError, KeyValuePattern, @@ -23,6 +25,13 @@ logger = logging.getLogger(__name__) +class TransformerOnConflict(ConfigEnum): + """Describes the behavior of the transformer when writing an aspect that already exists.""" + + DO_UPDATE = auto() # On conflict, apply the new aspect + DO_NOTHING = auto() # On conflict, do not apply the new aspect + + class AddDatasetDomainSemanticsConfig(TransformerSemanticsConfigModel): get_domains_to_add: Union[ Callable[[str], DomainsClass], @@ -32,10 +41,12 @@ class AddDatasetDomainSemanticsConfig(TransformerSemanticsConfigModel): _resolve_domain_fn = pydantic_resolve_key("get_domains_to_add") is_container: bool = False + on_conflict: TransformerOnConflict = TransformerOnConflict.DO_UPDATE class SimpleDatasetDomainSemanticsConfig(TransformerSemanticsConfigModel): domains: List[str] + on_conflict: TransformerOnConflict = TransformerOnConflict.DO_UPDATE class PatternDatasetDomainSemanticsConfig(TransformerSemanticsConfigModel): @@ -80,12 +91,13 @@ def get_domain_class( @staticmethod def _merge_with_server_domains( - graph: DataHubGraph, urn: str, mce_domain: Optional[DomainsClass] + graph: Optional[DataHubGraph], urn: str, mce_domain: Optional[DomainsClass] ) -> Optional[DomainsClass]: if not mce_domain or not mce_domain.domains: # nothing to add, no need to consult server return None + assert graph server_domain = graph.get_domain(entity_urn=urn) if server_domain: # compute patch @@ -155,7 +167,7 @@ def transform_aspect( self, entity_urn: str, aspect_name: str, aspect: Optional[Aspect] ) -> Optional[Aspect]: in_domain_aspect: DomainsClass = cast(DomainsClass, aspect) - domain_aspect = 
DomainsClass(domains=[]) + domain_aspect: DomainsClass = DomainsClass(domains=[]) # Check if we have received existing aspect if in_domain_aspect is not None and self.config.replace_existing is False: domain_aspect.domains.extend(in_domain_aspect.domains) @@ -164,16 +176,18 @@ def transform_aspect( domain_aspect.domains.extend(domain_to_add.domains) - if self.config.semantics == TransformerSemantics.PATCH: - assert self.ctx.graph - patch_domain_aspect: Optional[ - DomainsClass - ] = AddDatasetDomain._merge_with_server_domains( - self.ctx.graph, entity_urn, domain_aspect - ) - return cast(Optional[Aspect], patch_domain_aspect) - - return cast(Optional[Aspect], domain_aspect) + final_aspect: Optional[DomainsClass] = domain_aspect + if domain_aspect.domains: + if self.config.on_conflict == TransformerOnConflict.DO_NOTHING: + assert self.ctx.graph + server_domain = self.ctx.graph.get_domain(entity_urn) + if server_domain and server_domain.domains: + return None + if self.config.semantics == TransformerSemantics.PATCH: + final_aspect = AddDatasetDomain._merge_with_server_domains( + self.ctx.graph, entity_urn, domain_aspect + ) + return cast(Optional[Aspect], final_aspect) class SimpleAddDatasetDomain(AddDatasetDomain): @@ -186,8 +200,7 @@ def __init__( domains = AddDatasetDomain.get_domain_class(ctx.graph, config.domains) generic_config = AddDatasetDomainSemanticsConfig( get_domains_to_add=lambda _: domains, - semantics=config.semantics, - replace_existing=config.replace_existing, + **config.dict(exclude={"domains"}), ) super().__init__(generic_config, ctx) diff --git a/metadata-ingestion/tests/unit/test_transform_dataset.py b/metadata-ingestion/tests/unit/test_transform_dataset.py index 2e2e85b5d18113..4e9a38cb37ae63 100644 --- a/metadata-ingestion/tests/unit/test_transform_dataset.py +++ b/metadata-ingestion/tests/unit/test_transform_dataset.py @@ -56,6 +56,7 @@ from datahub.ingestion.transformer.dataset_domain import ( PatternAddDatasetDomain, SimpleAddDatasetDomain, + TransformerOnConflict, ) from datahub.ingestion.transformer.dataset_domain_based_on_tags import ( DatasetTagDomainMapper, @@ -2498,6 +2499,81 @@ def fake_get_domain(entity_urn: str) -> models.DomainsClass: assert server_domain in transformed_aspect.domains +def test_simple_add_dataset_domain_on_conflict_do_nothing( + pytestconfig, tmp_path, mock_time, mock_datahub_graph_instance +): + acryl_domain = builder.make_domain_urn("acryl.io") + datahub_domain = builder.make_domain_urn("datahubproject.io") + server_domain = builder.make_domain_urn("test.io") + + pipeline_context = PipelineContext(run_id="transformer_pipe_line") + pipeline_context.graph = mock_datahub_graph_instance + + # Return fake aspect to simulate server behaviour + def fake_get_domain(entity_urn: str) -> models.DomainsClass: + return models.DomainsClass(domains=[server_domain]) + + pipeline_context.graph.get_domain = fake_get_domain # type: ignore + + output = run_dataset_transformer_pipeline( + transformer_type=SimpleAddDatasetDomain, + aspect=models.DomainsClass(domains=[datahub_domain]), + config={ + "replace_existing": False, + "semantics": TransformerSemantics.PATCH, + "domains": [acryl_domain], + "on_conflict": TransformerOnConflict.DO_NOTHING, + }, + pipeline_context=pipeline_context, + ) + + assert len(output) == 1 + assert output[0] is not None + assert output[0].record is not None + assert isinstance(output[0].record, EndOfStream) + + +def test_simple_add_dataset_domain_on_conflict_do_nothing_no_conflict( + pytestconfig, tmp_path, mock_time, 
mock_datahub_graph_instance +): + acryl_domain = builder.make_domain_urn("acryl.io") + datahub_domain = builder.make_domain_urn("datahubproject.io") + irrelevant_domain = builder.make_domain_urn("test.io") + + pipeline_context = PipelineContext(run_id="transformer_pipe_line") + pipeline_context.graph = mock_datahub_graph_instance + + # Return fake aspect to simulate server behaviour + def fake_get_domain(entity_urn: str) -> models.DomainsClass: + return models.DomainsClass(domains=[]) + + pipeline_context.graph.get_domain = fake_get_domain # type: ignore + + output = run_dataset_transformer_pipeline( + transformer_type=SimpleAddDatasetDomain, + aspect=models.DomainsClass(domains=[datahub_domain]), + config={ + "replace_existing": False, + "semantics": TransformerSemantics.PATCH, + "domains": [acryl_domain], + "on_conflict": TransformerOnConflict.DO_NOTHING, + }, + pipeline_context=pipeline_context, + ) + + assert len(output) == 2 + assert output[0] is not None + assert output[0].record is not None + assert isinstance(output[0].record, MetadataChangeProposalWrapper) + assert output[0].record.aspect is not None + assert isinstance(output[0].record.aspect, models.DomainsClass) + transformed_aspect = cast(models.DomainsClass, output[0].record.aspect) + assert len(transformed_aspect.domains) == 2 + assert datahub_domain in transformed_aspect.domains + assert acryl_domain in transformed_aspect.domains + assert irrelevant_domain not in transformed_aspect.domains + + def test_pattern_add_dataset_domain_aspect_name(mock_datahub_graph_instance): pipeline_context: PipelineContext = PipelineContext( run_id="test_simple_add_dataset_domain" From 20aa2233c28b18ef6481a09fee8c6ee132b54e3e Mon Sep 17 00:00:00 2001 From: Jay Feldman <8128360+feldjay@users.noreply.github.com> Date: Fri, 18 Oct 2024 15:58:52 -0400 Subject: [PATCH 10/31] fix(ingest/looker): Remove bad imports from looker_common (#11663) --- .../src/datahub/ingestion/source/looker/looker_common.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index df855ede985313..ec32c4c6d24f24 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -928,7 +928,6 @@ def from_api( # noqa: C901 reporter: SourceReport, source_config: LookerDashboardSourceConfig, ) -> Optional["LookerExplore"]: # noqa: C901 - from datahub.ingestion.source.looker.lookml_source import _BASE_PROJECT_NAME try: explore = client.lookml_model_explore(model, explore_name) @@ -1190,7 +1189,6 @@ def _to_metadata_events( # noqa: C901 ) -> Optional[List[Union[MetadataChangeEvent, MetadataChangeProposalWrapper]]]: # We only generate MCE-s for explores that contain from clauses and do NOT contain joins # All other explores (passthrough explores and joins) end in correct resolution of lineage, and don't need additional nodes in the graph. 
- from datahub.ingestion.source.looker.lookml_source import _BASE_PROJECT_NAME dataset_snapshot = DatasetSnapshot( urn=self.get_explore_urn(config), From c4a8001495706894fa753249e91535c9505a0a79 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Fri, 18 Oct 2024 13:05:06 -0700 Subject: [PATCH 11/31] feat(ingest/looker): include project name in model/explore properties (#11664) Co-authored-by: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> --- .../ingestion/source/looker/looker_common.py | 16 ++-- .../ingestion/source/looker/looker_source.py | 41 ++++++---- .../looker/golden_looker_mces.json | 7 ++ .../looker/golden_test_allow_ingest.json | 4 + ...olden_test_external_project_view_mces.json | 4 + .../looker/golden_test_file_path_ingest.json | 4 + ...olden_test_folder_path_pattern_ingest.json | 4 + .../golden_test_independent_look_ingest.json | 82 +++++++++++-------- .../looker/golden_test_ingest.json | 4 + .../looker/golden_test_ingest_joins.json | 4 + .../golden_test_ingest_unaliased_joins.json | 4 + ...en_test_non_personal_independent_look.json | 7 ++ .../looker_mces_golden_deleted_stateful.json | 16 ++-- .../looker/looker_mces_usage_history.json | 4 + 14 files changed, 135 insertions(+), 66 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index ec32c4c6d24f24..1cd3c88a527cbd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -1201,15 +1201,19 @@ def _to_metadata_events( # noqa: C901 dataset_snapshot.aspects.append(browse_paths) dataset_snapshot.aspects.append(StatusClass(removed=False)) - custom_properties = {} - if self.label is not None: - custom_properties["looker.explore.label"] = str(self.label) - if self.source_file is not None: - custom_properties["looker.explore.file"] = str(self.source_file) + custom_properties = { + "project": self.project_name, + "model": self.model_name, + "looker.explore.label": self.label, + "looker.explore.name": self.name, + "looker.explore.file": self.source_file, + } dataset_props = DatasetPropertiesClass( name=str(self.label) if self.label else LookerUtil._display_name(self.name), description=self.description, - customProperties=custom_properties, + customProperties={ + k: str(v) for k, v in custom_properties.items() if v is not None + }, ) dataset_props.externalUrl = self._get_url(base_url) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py index 165d80b707000d..cd8ccb8217257c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py @@ -139,26 +139,21 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase): """ platform = "looker" - source_config: LookerDashboardSourceConfig - reporter: LookerDashboardSourceReport - user_registry: LookerUserRegistry - reachable_look_registry: Set[ - str - ] # Keep track of look-id which are reachable from Dashboard def __init__(self, config: LookerDashboardSourceConfig, ctx: PipelineContext): super().__init__(config, ctx) - self.source_config = config - self.reporter = LookerDashboardSourceReport() + self.source_config: LookerDashboardSourceConfig = config + self.reporter: LookerDashboardSourceReport = LookerDashboardSourceReport() 
self.looker_api: LookerAPI = LookerAPI(self.source_config) - self.user_registry = LookerUserRegistry(self.looker_api) - self.explore_registry = LookerExploreRegistry( + self.user_registry: LookerUserRegistry = LookerUserRegistry(self.looker_api) + self.explore_registry: LookerExploreRegistry = LookerExploreRegistry( self.looker_api, self.reporter, self.source_config ) self.reporter._looker_explore_registry = self.explore_registry self.reporter._looker_api = self.looker_api - self.reachable_look_registry = set() + # Keep track of look-id which are reachable from Dashboard + self.reachable_look_registry: Set[str] = set() # (model, explore) -> list of charts/looks/dashboards that reference this explore # The list values are used purely for debugging purposes. @@ -867,21 +862,31 @@ def _make_explore_metadata_events( ) -> Iterable[ Union[MetadataChangeEvent, MetadataChangeProposalWrapper, MetadataWorkUnit] ]: - if self.source_config.emit_used_explores_only: - explores_to_fetch = list(self.reachable_explores.keys()) - else: + if not self.source_config.emit_used_explores_only: explores_to_fetch = list(self.list_all_explores()) + else: + # We don't keep track of project names for each explore right now. + # Because project names are just used for a custom property, it's + # fine to set them to None. + # TODO: Track project names for each explore. + explores_to_fetch = [ + (None, model, explore) + for (model, explore) in self.reachable_explores.keys() + ] explores_to_fetch.sort() processed_models: List[str] = [] - for model, _ in explores_to_fetch: + for project_name, model, _ in explores_to_fetch: if model not in processed_models: model_key = gen_model_key(self.source_config, model) yield from gen_containers( container_key=model_key, name=model, sub_types=[BIContainerSubTypes.LOOKML_MODEL], + extra_properties=( + {"project": project_name} if project_name is not None else None + ), ) yield MetadataChangeProposalWrapper( entityUrn=model_key.as_urn(), @@ -895,7 +900,7 @@ def _make_explore_metadata_events( self.reporter.total_explores = len(explores_to_fetch) for future in BackpressureAwareExecutor.map( self.fetch_one_explore, - ((model, explore) for (model, explore) in explores_to_fetch), + ((model, explore) for (_project, model, explore) in explores_to_fetch), max_workers=self.source_config.max_threads, ): events, explore_id, start_time, end_time = future.result() @@ -906,7 +911,7 @@ def _make_explore_metadata_events( f"Running time of fetch_one_explore for {explore_id}: {(end_time - start_time).total_seconds()}" ) - def list_all_explores(self) -> Iterable[Tuple[str, str]]: + def list_all_explores(self) -> Iterable[Tuple[Optional[str], str, str]]: # returns a list of (model, explore) tuples for model in self.looker_api.all_lookml_models(): @@ -915,7 +920,7 @@ def list_all_explores(self) -> Iterable[Tuple[str, str]]: for explore in model.explores: if explore.name is None: continue - yield (model.name, explore.name) + yield (model.project_name, model.name, explore.name) def fetch_one_explore( self, model: str, explore: str diff --git a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json index 5cac7b1bb73b19..a9c445b5986efe 100644 --- a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json +++ b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json @@ -11,6 +11,7 @@ "description": "lorem ipsum", "charts": [], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 
1586847600000, @@ -440,7 +441,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "bogus data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/bogus data/my_view", @@ -616,7 +620,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json index 24a738a815cda8..af9c62a2a41803 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json @@ -11,6 +11,7 @@ "description": "lorem ipsum", "charts": [], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -282,7 +283,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json index b1460779da4f5f..b89bc356b48fdc 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json @@ -202,6 +202,7 @@ "urn:li:chart:(looker,dashboard_elements.2)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -520,7 +521,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "looker_hub", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json index 74400b9b5cc56b..810fefd8f6cb85 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json @@ -202,6 +202,7 @@ "urn:li:chart:(looker,dashboard_elements.2)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -520,7 +521,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "looker_hub", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json 
b/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json index 89241fb52fb634..3d78397f54a235 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json @@ -287,6 +287,7 @@ "description": "third", "charts": [], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -613,7 +614,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json index f178e97e78fa02..5a540e61e768d7 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json @@ -210,6 +210,7 @@ "urn:li:chart:(looker,dashboard_elements.2)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -1107,12 +1108,12 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/Explore/data" + "/Explore/sales_model" ] } }, @@ -1124,10 +1125,13 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "sales_model", "looker.explore.label": "My Explore View", + "looker.explore.name": "sales_explore", "looker.explore.file": "test_source_file.lkml" }, - "externalUrl": "https://looker.company.com/explore/data/my_view", + "externalUrl": "https://looker.company.com/explore/sales_model/sales_explore", "name": "My Explore View", "description": "lorem ipsum", "tags": [] @@ -1149,7 +1153,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "my_view", + "schemaName": "sales_explore", "platform": "urn:li:dataPlatform:looker", "version": 0, "created": { @@ -1204,7 +1208,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1223,12 +1227,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "embed", "aspect": { "json": { - "renderUrl": "https://looker.company.com/embed/explore/data/my_view" + "renderUrl": "https://looker.company.com/embed/explore/sales_model/sales_explore" } }, "systemMetadata": { @@ -1240,12 +1244,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" + "container": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" } }, "systemMetadata": { @@ -1257,7 +1261,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1267,8 +1271,8 @@ "id": "Explore" }, { - "id": "urn:li:container:59a5aa45397364e6882e793f1bc77b42", - "urn": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" + "id": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5", + "urn": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" } ] } @@ -1283,12 +1287,12 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/Explore/order_model" + "/Explore/data" ] } }, @@ -1300,10 +1304,13 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, - "externalUrl": "https://looker.company.com/explore/order_model/order_explore", + "externalUrl": "https://looker.company.com/explore/data/my_view", "name": "My Explore View", "description": "lorem ipsum", "tags": [] @@ -1325,7 +1332,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "order_explore", + "schemaName": "my_view", "platform": "urn:li:dataPlatform:looker", "version": 0, "created": { @@ -1380,7 +1387,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1399,12 +1406,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "embed", "aspect": { "json": { - "renderUrl": "https://looker.company.com/embed/explore/order_model/order_explore" + "renderUrl": "https://looker.company.com/embed/explore/data/my_view" } }, "systemMetadata": { @@ -1416,12 +1423,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" + "container": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" } }, "systemMetadata": { @@ -1433,7 +1440,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", 
"changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1443,8 +1450,8 @@ "id": "Explore" }, { - "id": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60", - "urn": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" + "id": "urn:li:container:59a5aa45397364e6882e793f1bc77b42", + "urn": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" } ] } @@ -1459,12 +1466,12 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/Explore/sales_model" + "/Explore/order_model" ] } }, @@ -1476,10 +1483,13 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "order_model", "looker.explore.label": "My Explore View", + "looker.explore.name": "order_explore", "looker.explore.file": "test_source_file.lkml" }, - "externalUrl": "https://looker.company.com/explore/sales_model/sales_explore", + "externalUrl": "https://looker.company.com/explore/order_model/order_explore", "name": "My Explore View", "description": "lorem ipsum", "tags": [] @@ -1501,7 +1511,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "sales_explore", + "schemaName": "order_explore", "platform": "urn:li:dataPlatform:looker", "version": 0, "created": { @@ -1556,7 +1566,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1575,12 +1585,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "embed", "aspect": { "json": { - "renderUrl": "https://looker.company.com/embed/explore/sales_model/sales_explore" + "renderUrl": "https://looker.company.com/embed/explore/order_model/order_explore" } }, "systemMetadata": { @@ -1592,12 +1602,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" + "container": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" } }, "systemMetadata": { @@ -1609,7 +1619,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1619,8 +1629,8 @@ "id": "Explore" }, { - "id": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5", - "urn": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" + "id": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60", + "urn": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" } ] } diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json 
b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json index d969ef62a96e5f..9ac95b8482a475 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json @@ -229,6 +229,7 @@ "urn:li:chart:(looker,ap-south-1.dashboard_elements.2)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -574,7 +575,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json index 153db363c78280..3a2c6359ea63c2 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json @@ -202,6 +202,7 @@ "urn:li:chart:(looker,dashboard_elements.2)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -520,7 +521,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json index 98adbdc5b829e4..007eee348aeaf8 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json @@ -11,6 +11,7 @@ "description": "lorem ipsum", "charts": [], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -282,7 +283,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json b/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json index 63ffdda8c5b6f5..859b9163d7aad6 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json @@ -210,6 +210,7 @@ "urn:li:chart:(looker,dashboard_elements.2)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -783,7 +784,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", @@ -959,7 +963,10 @@ { 
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "sales_model", "looker.explore.label": "My Explore View", + "looker.explore.name": "sales_explore", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/sales_model/sales_explore", diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json index 567ab78a14754b..8256c984afb274 100644 --- a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json @@ -210,6 +210,7 @@ "urn:li:chart:(looker,dashboard_elements.2)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -539,7 +540,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", @@ -810,8 +814,8 @@ } }, { - "entityType": "chart", - "entityUrn": "urn:li:chart:(looker,dashboard_elements.10)", + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(looker,dashboards.11)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -827,8 +831,8 @@ } }, { - "entityType": "container", - "entityUrn": "urn:li:container:621eb6e00da9abece0f64522f81be0e7", + "entityType": "chart", + "entityUrn": "urn:li:chart:(looker,dashboard_elements.10)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -844,8 +848,8 @@ } }, { - "entityType": "dashboard", - "entityUrn": "urn:li:dashboard:(looker,dashboards.11)", + "entityType": "container", + "entityUrn": "urn:li:container:621eb6e00da9abece0f64522f81be0e7", "changeType": "UPSERT", "aspectName": "status", "aspect": { diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json index 3befb62a631de5..0b3530f9c24629 100644 --- a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json @@ -11,6 +11,7 @@ "description": "lorem ipsum", "charts": [], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -234,7 +235,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", From 12abda4e4d21dfd94081956c9f9f82934eb1806b Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Fri, 18 Oct 2024 14:29:03 -0700 Subject: [PATCH 12/31] feat(ingest/fivetran): protect against high sync volume (#11589) --- .../ingestion/source/fivetran/fivetran.py | 21 ++++--- .../source/fivetran/fivetran_log_api.py | 63 ++++++++++--------- .../source/fivetran/fivetran_query.py | 34 +++++++--- .../integration/fivetran/test_fivetran.py | 58 ++++------------- 4 files changed, 87 insertions(+), 89 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py 
b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py index 704a6f20a5c19b..334bb58ea84f8e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py @@ -27,7 +27,10 @@ PlatformDetail, ) from datahub.ingestion.source.fivetran.data_classes import Connector, Job -from datahub.ingestion.source.fivetran.fivetran_log_api import FivetranLogAPI +from datahub.ingestion.source.fivetran.fivetran_log_api import ( + MAX_JOBS_PER_CONNECTOR, + FivetranLogAPI, +) from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, ) @@ -72,11 +75,6 @@ def __init__(self, config: FivetranSourceConfig, ctx: PipelineContext): self.audit_log = FivetranLogAPI(self.config.fivetran_log_config) - # Create and register the stateful ingestion use-case handler. - self.stale_entity_removal_handler = StaleEntityRemovalHandler.create( - self, self.config, self.ctx - ) - def _extend_lineage(self, connector: Connector, datajob: DataJob) -> None: input_dataset_urn_list: List[DatasetUrn] = [] output_dataset_urn_list: List[DatasetUrn] = [] @@ -267,6 +265,13 @@ def _get_connector_workunits( ).as_workunit(is_primary_source=False) # Map Fivetran's job/sync history entity with Datahub's data process entity + if len(connector.jobs) >= MAX_JOBS_PER_CONNECTOR: + self.report.warning( + title="Not all sync history was captured", + message=f"The connector had more than {MAX_JOBS_PER_CONNECTOR} sync runs in the past {self.config.history_sync_lookback_period} days. " + f"Only the most recent {MAX_JOBS_PER_CONNECTOR} syncs were ingested.", + context=f"{connector.connector_name} (connector_id: {connector.connector_id})", + ) for job in connector.jobs: dpi = self._generate_dpi_from_job(job, datajob) yield from self._get_dpi_workunits(job, dpi) @@ -279,7 +284,9 @@ def create(cls, config_dict: dict, ctx: PipelineContext) -> Source: def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), - self.stale_entity_removal_handler.workunit_processor, + StaleEntityRemovalHandler.create( + self, self.config, self.ctx + ).workunit_processor, ] def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py index 31c16139066e43..5908efe39e2b40 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py @@ -22,6 +22,10 @@ logger: logging.Logger = logging.getLogger(__name__) +# We don't want to generate a massive number of dataProcesses for a single connector. +# This is primarily used as a safeguard to prevent performance issues. 
+MAX_JOBS_PER_CONNECTOR = 1000 + class FivetranLogAPI: def __init__(self, fivetran_log_config: FivetranLogConfig) -> None: @@ -158,34 +162,32 @@ def _get_table_lineage( return table_lineage_list - def _get_all_connector_sync_logs(self, syncs_interval: int) -> Dict[str, Dict]: - sync_logs = {} - for row in self._query( - self.fivetran_log_query.get_sync_logs_query().format( - db_clause=self.fivetran_log_query.db_clause, - syncs_interval=syncs_interval, - ) - ): - if row[Constant.CONNECTOR_ID] not in sync_logs: - sync_logs[row[Constant.CONNECTOR_ID]] = { - row[Constant.SYNC_ID]: { - row["message_event"]: ( - row[Constant.TIME_STAMP].timestamp(), - row[Constant.MESSAGE_DATA], - ) - } - } - elif row[Constant.SYNC_ID] not in sync_logs[row[Constant.CONNECTOR_ID]]: - sync_logs[row[Constant.CONNECTOR_ID]][row[Constant.SYNC_ID]] = { - row["message_event"]: ( - row[Constant.TIME_STAMP].timestamp(), - row[Constant.MESSAGE_DATA], - ) - } - else: - sync_logs[row[Constant.CONNECTOR_ID]][row[Constant.SYNC_ID]][ - row["message_event"] - ] = (row[Constant.TIME_STAMP].timestamp(), row[Constant.MESSAGE_DATA]) + def _get_all_connector_sync_logs( + self, syncs_interval: int, connector_ids: List[str] + ) -> Dict[str, Dict[str, Dict[str, Tuple[float, Optional[str]]]]]: + sync_logs: Dict[str, Dict[str, Dict[str, Tuple[float, Optional[str]]]]] = {} + + # Format connector_ids as a comma-separated string of quoted IDs + formatted_connector_ids = ", ".join(f"'{id}'" for id in connector_ids) + + query = self.fivetran_log_query.get_sync_logs_query().format( + db_clause=self.fivetran_log_query.db_clause, + syncs_interval=syncs_interval, + max_jobs_per_connector=MAX_JOBS_PER_CONNECTOR, + connector_ids=formatted_connector_ids, + ) + + for row in self._query(query): + connector_id = row[Constant.CONNECTOR_ID] + sync_id = row[Constant.SYNC_ID] + + if connector_id not in sync_logs: + sync_logs[connector_id] = {} + + sync_logs[connector_id][sync_id] = { + "sync_start": (row["start_time"].timestamp(), None), + "sync_end": (row["end_time"].timestamp(), row["end_message_data"]), + } return sync_logs @@ -244,7 +246,10 @@ def _fill_connectors_table_lineage(self, connectors: List[Connector]) -> None: def _fill_connectors_jobs( self, connectors: List[Connector], syncs_interval: int ) -> None: - sync_logs = self._get_all_connector_sync_logs(syncs_interval) + connector_ids = [connector.connector_id for connector in connectors] + sync_logs = self._get_all_connector_sync_logs( + syncs_interval, connector_ids=connector_ids + ) for connector in connectors: connector.jobs = self._get_jobs_list(sync_logs.get(connector.connector_id)) diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py index d965f53ff554b3..c4680b4b1037a2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py @@ -37,14 +37,32 @@ def get_users_query(self) -> str: def get_sync_logs_query(self) -> str: return """ - SELECT connector_id, - sync_id, - message_event, - message_data, - time_stamp - FROM {db_clause}log - WHERE message_event in ('sync_start', 'sync_end') - and time_stamp > CURRENT_TIMESTAMP - INTERVAL '{syncs_interval} days'""" + WITH ranked_syncs AS ( + SELECT + connector_id, + sync_id, + MAX(CASE WHEN message_event = 'sync_start' THEN time_stamp END) as start_time, + MAX(CASE WHEN message_event = 'sync_end' THEN time_stamp END) as end_time, + 
MAX(CASE WHEN message_event = 'sync_end' THEN message_data END) as end_message_data, + ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY MAX(time_stamp) DESC) as rn + FROM {db_clause}log + WHERE message_event in ('sync_start', 'sync_end') + AND time_stamp > CURRENT_TIMESTAMP - INTERVAL '{syncs_interval} days' + AND connector_id IN ({connector_ids}) + GROUP BY connector_id, sync_id + ) + SELECT + connector_id, + sync_id, + start_time, + end_time, + end_message_data + FROM ranked_syncs + WHERE rn <= {max_jobs_per_connector} + AND start_time IS NOT NULL + AND end_time IS NOT NULL + ORDER BY connector_id, end_time DESC + """ def get_table_lineage_query(self) -> str: return f""" diff --git a/metadata-ingestion/tests/integration/fivetran/test_fivetran.py b/metadata-ingestion/tests/integration/fivetran/test_fivetran.py index 0f5d098ee39c4a..33ac09e69a3c0a 100644 --- a/metadata-ingestion/tests/integration/fivetran/test_fivetran.py +++ b/metadata-ingestion/tests/integration/fivetran/test_fivetran.py @@ -101,64 +101,32 @@ def default_query_results( } ] elif query == fivetran_log_query.get_sync_logs_query().format( - db_clause=fivetran_log_query.db_clause, syncs_interval=7 + db_clause=fivetran_log_query.db_clause, + syncs_interval=7, + max_jobs_per_connector=1000, + connector_ids="'calendar_elected'", ): return [ { "connector_id": "calendar_elected", "sync_id": "4c9a03d6-eded-4422-a46a-163266e58243", - "message_event": "sync_start", - "message_data": None, - "time_stamp": datetime.datetime(2023, 9, 20, 6, 37, 32, 606000), + "start_time": datetime.datetime(2023, 9, 20, 6, 37, 32, 606000), + "end_time": datetime.datetime(2023, 9, 20, 6, 38, 5, 56000), + "end_message_data": '"{\\"status\\":\\"SUCCESSFUL\\"}"', }, { "connector_id": "calendar_elected", "sync_id": "f773d1e9-c791-48f4-894f-8cf9b3dfc834", - "message_event": "sync_start", - "message_data": None, - "time_stamp": datetime.datetime(2023, 10, 3, 14, 35, 30, 345000), + "start_time": datetime.datetime(2023, 10, 3, 14, 35, 30, 345000), + "end_time": datetime.datetime(2023, 10, 3, 14, 35, 31, 512000), + "end_message_data": '"{\\"reason\\":\\"Sync has been cancelled because of a user action in the dashboard.Standard Config updated.\\",\\"status\\":\\"CANCELED\\"}"', }, { "connector_id": "calendar_elected", "sync_id": "63c2fc85-600b-455f-9ba0-f576522465be", - "message_event": "sync_start", - "message_data": None, - "time_stamp": datetime.datetime(2023, 10, 3, 14, 35, 55, 401000), - }, - { - "connector_id": "calendar_elected", - "sync_id": "e773e1e9-c791-46f4-894f-8ch9b3dfc832", - "message_event": "sync_start", - "message_data": None, - "time_stamp": datetime.datetime(2023, 10, 3, 14, 37, 5, 403000), - }, - { - "connector_id": "calendar_elected", - "sync_id": "4c9a03d6-eded-4422-a46a-163266e58243", - "message_event": "sync_end", - "message_data": '"{\\"status\\":\\"SUCCESSFUL\\"}"', - "time_stamp": datetime.datetime(2023, 9, 20, 6, 38, 5, 56000), - }, - { - "connector_id": "calendar_elected", - "sync_id": "f773d1e9-c791-48f4-894f-8cf9b3dfc834", - "message_event": "sync_end", - "message_data": '"{\\"reason\\":\\"Sync has been cancelled because of a user action in the dashboard.Standard Config updated.\\",\\"status\\":\\"CANCELED\\"}"', - "time_stamp": datetime.datetime(2023, 10, 3, 14, 35, 31, 512000), - }, - { - "connector_id": "calendar_elected", - "sync_id": "63c2fc85-600b-455f-9ba0-f576522465be", - "message_event": "sync_end", - "message_data": '"{\\"reason\\":\\"java.lang.RuntimeException: FATAL: too many connections for role 
\\\\\\"hxwraqld\\\\\\"\\",\\"taskType\\":\\"reconnect\\",\\"status\\":\\"FAILURE_WITH_TASK\\"}"', - "time_stamp": datetime.datetime(2023, 10, 3, 14, 36, 29, 678000), - }, - { - "connector_id": "calendar_elected", - "sync_id": "e773e1e9-c791-46f4-894f-8ch9b3dfc832", - "message_event": "sync_end", - "message_data": None, - "time_stamp": datetime.datetime(2023, 10, 3, 14, 37, 35, 478000), + "start_time": datetime.datetime(2023, 10, 3, 14, 35, 55, 401000), + "end_time": datetime.datetime(2023, 10, 3, 14, 36, 29, 678000), + "end_message_data": '"{\\"reason\\":\\"java.lang.RuntimeException: FATAL: too many connections for role \\\\\\"hxwraqld\\\\\\"\\",\\"taskType\\":\\"reconnect\\",\\"status\\":\\"FAILURE_WITH_TASK\\"}"', }, ] # Unreachable code From bda79bd489753ec017a503348aa57af617db99a9 Mon Sep 17 00:00:00 2001 From: Shirshanka Das Date: Sat, 19 Oct 2024 14:53:28 -0700 Subject: [PATCH 13/31] feat(sdk):platform-resource - complex queries (#11675) --- .../src/datahub/utilities/openapi_utils.py | 69 +++++ .../src/datahub/utilities/search_utils.py | 285 ++++++++++++++++++ .../test_platform_resource.py | 15 + .../tests/unit/utilities/test_search_utils.py | 71 +++++ .../test_platform_resource.py | 78 ++++- 5 files changed, 508 insertions(+), 10 deletions(-) create mode 100644 metadata-ingestion/src/datahub/utilities/openapi_utils.py create mode 100644 metadata-ingestion/src/datahub/utilities/search_utils.py create mode 100644 metadata-ingestion/tests/unit/utilities/test_search_utils.py diff --git a/metadata-ingestion/src/datahub/utilities/openapi_utils.py b/metadata-ingestion/src/datahub/utilities/openapi_utils.py new file mode 100644 index 00000000000000..e704ff7f84cbbc --- /dev/null +++ b/metadata-ingestion/src/datahub/utilities/openapi_utils.py @@ -0,0 +1,69 @@ +import logging +from typing import Iterable, Union + +import datahub.metadata.schema_classes as models +from datahub.ingestion.graph.client import DataHubGraph +from datahub.utilities.search_utils import ( + ElasticDocumentQuery, + ElasticsearchQueryBuilder, +) + +logger = logging.getLogger(__name__) + + +class OpenAPIGraphClient: + """ + An experimental client for the DataHubGraph that uses the OpenAPI endpoints + to query entities and aspects. + Does not support all features of the DataHubGraph. + API is subject to change. + + DO NOT USE THIS UNLESS YOU KNOW WHAT YOU ARE DOING. + """ + + ENTITY_KEY_ASPECT_MAP = { + aspect_type.ASPECT_INFO.get("keyForEntity"): name + for name, aspect_type in models.ASPECT_NAME_MAP.items() + if aspect_type.ASPECT_INFO.get("keyForEntity") + } + + def __init__(self, graph: DataHubGraph): + self.graph = graph + self.openapi_base = graph._gms_server.rstrip("/") + "/openapi/v3" + + def scroll_urns_by_filter( + self, + entity_type: str, + query: Union[ElasticDocumentQuery, ElasticsearchQueryBuilder], + ) -> Iterable[str]: + """ + Scroll through all urns that match the given filters. 
+ + """ + + key_aspect = self.ENTITY_KEY_ASPECT_MAP.get(entity_type) + assert key_aspect, f"No key aspect found for entity type {entity_type}" + + count = 1000 + string_query = query.build() + scroll_id = None + logger.debug(f"Scrolling with query: {string_query}") + while True: + response = self.graph._get_generic( + self.openapi_base + f"/entity/{entity_type.lower()}", + params={ + "systemMetadata": "false", + "includeSoftDelete": "false", + "skipCache": "false", + "aspects": [key_aspect], + "scrollId": scroll_id, + "count": count, + "query": string_query, + }, + ) + entities = response.get("entities", []) + scroll_id = response.get("scrollId") + for entity in entities: + yield entity["urn"] + if not scroll_id: + break diff --git a/metadata-ingestion/src/datahub/utilities/search_utils.py b/metadata-ingestion/src/datahub/utilities/search_utils.py new file mode 100644 index 00000000000000..0bd88addd86600 --- /dev/null +++ b/metadata-ingestion/src/datahub/utilities/search_utils.py @@ -0,0 +1,285 @@ +import logging +import re +from enum import Enum +from typing import Generic, List, Optional, Tuple, Type, TypeVar, Union + +logger = logging.getLogger(__name__) + + +class LogicalOperator(Enum): + AND = "AND" + OR = "OR" + + +class SearchField: + def __init__(self, field_name: str): + self.field_name = field_name + + def get_search_value(self, value: str) -> str: + return value + + def __str__(self) -> str: + return self.field_name + + def __repr__(self) -> str: + return self.__str__() + + @classmethod + def from_string_field(cls, field_name: str) -> "SearchField": + return cls(field_name) + + +class QueryNode: + def __init__(self, operator: Optional[LogicalOperator] = None): + self.operator = operator + self.children: List[Union[QueryNode, str]] = [] + + def add_child(self, child: Union["QueryNode", str]) -> None: + self.children.append(child) + + def build(self) -> str: + if not self.children: + return "" + + if self.operator is None: + return ( + self.children[0] + if isinstance(self.children[0], str) + else self.children[0].build() + ) + + child_queries = [] + for child in self.children: + if isinstance(child, str): + child_queries.append(child) + else: + child_queries.append(child.build()) + + joined_queries = f" {self.operator.value} ".join(child_queries) + return f"({joined_queries})" if len(child_queries) > 1 else joined_queries + + +class ElasticsearchQueryBuilder: + SPECIAL_CHARACTERS = r'+-=&|> None: + self.root = QueryNode(operator=operator) + + @classmethod + def escape_special_characters(cls, value: str) -> str: + """ + Escape special characters in the search term. 
+ """ + return re.sub(f"([{re.escape(cls.SPECIAL_CHARACTERS)}])", r"\\\1", value) + + def _create_term( + self, field: SearchField, value: str, is_exact: bool = False + ) -> str: + escaped_value = self.escape_special_characters(field.get_search_value(value)) + field_name: str = field.field_name + if is_exact: + return f'{field_name}:"{escaped_value}"' + return f"{field_name}:{escaped_value}" + + def add_field_match( + self, field: SearchField, value: str, is_exact: bool = True + ) -> "ElasticsearchQueryBuilder": + term = self._create_term(field, value, is_exact) + self.root.add_child(term) + return self + + def add_field_not_match( + self, field: SearchField, value: str, is_exact: bool = True + ) -> "ElasticsearchQueryBuilder": + term = f"-{self._create_term(field, value, is_exact)}" + self.root.add_child(term) + return self + + def add_range( + self, + field: str, + min_value: Optional[str] = None, + max_value: Optional[str] = None, + include_min: bool = True, + include_max: bool = True, + ) -> "ElasticsearchQueryBuilder": + min_bracket = "[" if include_min else "{" + max_bracket = "]" if include_max else "}" + min_val = min_value if min_value is not None else "*" + max_val = max_value if max_value is not None else "*" + range_query = f"{field}:{min_bracket}{min_val} TO {max_val}{max_bracket}" + self.root.add_child(range_query) + return self + + def add_wildcard(self, field: str, pattern: str) -> "ElasticsearchQueryBuilder": + wildcard_query = f"{field}:{pattern}" + self.root.add_child(wildcard_query) + return self + + def add_fuzzy( + self, field: str, value: str, fuzziness: int = 2 + ) -> "ElasticsearchQueryBuilder": + fuzzy_query = f"{field}:{value}~{fuzziness}" + self.root.add_child(fuzzy_query) + return self + + def add_boost( + self, field: str, value: str, boost: float + ) -> "ElasticsearchQueryBuilder": + boosted_query = f"{field}:{value}^{boost}" + self.root.add_child(boosted_query) + return self + + def group(self, operator: LogicalOperator) -> "QueryGroup": + return QueryGroup(self, operator) + + def build(self) -> str: + return self.root.build() + + +class QueryGroup: + def __init__(self, parent: ElasticsearchQueryBuilder, operator: LogicalOperator): + self.parent = parent + self.node = QueryNode(operator) + self.parent.root.add_child(self.node) + + def add_field_match( + self, field: Union[str, SearchField], value: str, is_exact: bool = True + ) -> "QueryGroup": + if isinstance(field, str): + field = SearchField.from_string_field(field) + term = self.parent._create_term(field, value, is_exact) + self.node.add_child(term) + return self + + def add_field_not_match( + self, field: Union[str, SearchField], value: str, is_exact: bool = True + ) -> "QueryGroup": + if isinstance(field, str): + field = SearchField.from_string_field(field) + term = f"-{self.parent._create_term(field, value, is_exact)}" + self.node.add_child(term) + return self + + def add_range( + self, + field: str, + min_value: Optional[str] = None, + max_value: Optional[str] = None, + include_min: bool = True, + include_max: bool = True, + ) -> "QueryGroup": + min_bracket = "[" if include_min else "{" + max_bracket = "]" if include_max else "}" + min_val = min_value if min_value is not None else "*" + max_val = max_value if max_value is not None else "*" + range_query = f"{field}:{min_bracket}{min_val} TO {max_val}{max_bracket}" + self.node.add_child(range_query) + return self + + def add_wildcard(self, field: str, pattern: str) -> "QueryGroup": + wildcard_query = f"{field}:{pattern}" + 
self.node.add_child(wildcard_query) + return self + + def add_fuzzy(self, field: str, value: str, fuzziness: int = 2) -> "QueryGroup": + fuzzy_query = f"{field}:{value}~{fuzziness}" + self.node.add_child(fuzzy_query) + return self + + def add_boost(self, field: str, value: str, boost: float) -> "QueryGroup": + boosted_query = f"{field}:{value}^{boost}" + self.node.add_child(boosted_query) + return self + + def group(self, operator: LogicalOperator) -> "QueryGroup": + new_group = QueryGroup(self.parent, operator) + self.node.add_child(new_group.node) + return new_group + + def end(self) -> ElasticsearchQueryBuilder: + return self.parent + + +SF = TypeVar("SF", bound=SearchField) + + +class ElasticDocumentQuery(Generic[SF]): + def __init__(self) -> None: + self.query_builder = ElasticsearchQueryBuilder() + + @classmethod + def create_from( + cls: Type["ElasticDocumentQuery[SF]"], + *args: Tuple[Union[str, SF], str], + ) -> "ElasticDocumentQuery[SF]": + instance = cls() + for arg in args: + if isinstance(arg, SearchField): + # If the value is empty, we treat it as a wildcard search + logger.info(f"Adding wildcard search for field {arg}") + instance.add_wildcard(arg, "*") + elif isinstance(arg, tuple) and len(arg) == 2: + field, value = arg + assert isinstance(value, str) + if isinstance(field, SearchField): + instance.add_field_match(field, value) + elif isinstance(field, str): + instance.add_field_match( + SearchField.from_string_field(field), value + ) + else: + raise ValueError("Invalid field type {}".format(type(field))) + return instance + + def add_field_match( + self, field: Union[str, SearchField], value: str, is_exact: bool = True + ) -> "ElasticDocumentQuery": + if isinstance(field, str): + field = SearchField.from_string_field(field) + self.query_builder.add_field_match(field, value, is_exact) + return self + + def add_field_not_match( + self, field: SearchField, value: str, is_exact: bool = True + ) -> "ElasticDocumentQuery": + self.query_builder.add_field_not_match(field, value, is_exact) + return self + + def add_range( + self, + field: SearchField, + min_value: Optional[str] = None, + max_value: Optional[str] = None, + include_min: bool = True, + include_max: bool = True, + ) -> "ElasticDocumentQuery": + field_name: str = field.field_name # type: ignore + self.query_builder.add_range( + field_name, min_value, max_value, include_min, include_max + ) + return self + + def add_wildcard(self, field: SearchField, pattern: str) -> "ElasticDocumentQuery": + field_name: str = field.field_name # type: ignore + self.query_builder.add_wildcard(field_name, pattern) + return self + + def add_fuzzy( + self, field: SearchField, value: str, fuzziness: int = 2 + ) -> "ElasticDocumentQuery": + field_name: str = field.field_name # type: ignore + self.query_builder.add_fuzzy(field_name, value, fuzziness) + return self + + def add_boost( + self, field: SearchField, value: str, boost: float + ) -> "ElasticDocumentQuery": + self.query_builder.add_boost(field.field_name, value, boost) + return self + + def group(self, operator: LogicalOperator) -> QueryGroup: + return self.query_builder.group(operator) + + def build(self) -> str: + return self.query_builder.build() diff --git a/metadata-ingestion/tests/unit/api/entities/platformresource/test_platform_resource.py b/metadata-ingestion/tests/unit/api/entities/platformresource/test_platform_resource.py index e6c9a9466d62b4..a84e373dbe72c2 100644 --- a/metadata-ingestion/tests/unit/api/entities/platformresource/test_platform_resource.py +++ 
b/metadata-ingestion/tests/unit/api/entities/platformresource/test_platform_resource.py @@ -4,9 +4,12 @@ import datahub.metadata.schema_classes as models from datahub.api.entities.platformresource.platform_resource import ( + ElasticPlatformResourceQuery, PlatformResource, PlatformResourceKey, + PlatformResourceSearchFields, ) +from datahub.utilities.search_utils import LogicalOperator def test_platform_resource_dict(): @@ -179,3 +182,15 @@ class TestModel(BaseModel): ).encode("utf-8") assert platform_resource_info_mcp.aspect.value.schemaType == "JSON" assert platform_resource_info_mcp.aspect.value.schemaRef == TestModel.__name__ + + +def test_platform_resource_filters(): + + query = ( + ElasticPlatformResourceQuery.create_from() + .group(LogicalOperator.AND) + .add_field_match(PlatformResourceSearchFields.PRIMARY_KEY, "test_1") + .add_field_match(PlatformResourceSearchFields.RESOURCE_TYPE, "server") + .end() + ) + assert query.build() == '(primaryKey:"test_1" AND resourceType:"server")' diff --git a/metadata-ingestion/tests/unit/utilities/test_search_utils.py b/metadata-ingestion/tests/unit/utilities/test_search_utils.py new file mode 100644 index 00000000000000..6fa2e46c7f20e8 --- /dev/null +++ b/metadata-ingestion/tests/unit/utilities/test_search_utils.py @@ -0,0 +1,71 @@ +from datahub.utilities.search_utils import ( + ElasticDocumentQuery, + LogicalOperator, + SearchField, +) + + +def test_simple_and_filters(): + query = ( + ElasticDocumentQuery.create_from() + .group(LogicalOperator.AND) + .add_field_match("field1", "value1") + .add_field_match("field2", "value2") + .end() + ) + + assert query.build() == '(field1:"value1" AND field2:"value2")' + + +def test_simple_or_filters(): + query = ( + ElasticDocumentQuery.create_from() + .group(LogicalOperator.OR) + .add_field_match("field1", "value1") + .add_field_match("field2", "value2") + .end() + ) + + assert query.build() == '(field1:"value1" OR field2:"value2")' + + # Use SearchFilter to create this query + query = ( + ElasticDocumentQuery.create_from() + .group(LogicalOperator.OR) + .add_field_match(SearchField.from_string_field("field1"), "value1") + .add_field_match(SearchField.from_string_field("field2"), "value2") + .end() + ) + assert query.build() == '(field1:"value1" OR field2:"value2")' + + +def test_simple_field_match(): + query: ElasticDocumentQuery = ElasticDocumentQuery.create_from( + ("field1", "value1:1") + ) + assert query.build() == 'field1:"value1\\:1"' + + # Another way to create the same query + query = ElasticDocumentQuery.create_from() + query.add_field_match("field1", "value1:1") + assert query.build() == 'field1:"value1\\:1"' + + +def test_negation(): + query = ( + ElasticDocumentQuery.create_from() + .group(LogicalOperator.AND) + .add_field_match("field1", "value1") + .add_field_not_match("field2", "value2") + .end() + ) + + assert query.build() == '(field1:"value1" AND -field2:"value2")' + + +def test_multi_arg_create_from(): + query: ElasticDocumentQuery = ElasticDocumentQuery.create_from( + ("field1", "value1"), + ("field2", "value2"), + ) + assert query.build() == '(field1:"value1" AND field2:"value2")' diff --git a/smoke-test/tests/platform_resources/test_platform_resource.py b/smoke-test/tests/platform_resources/test_platform_resource.py index 7ebfd4d6ea15b4..39d15f2e8dea6d 100644 --- a/smoke-test/tests/platform_resources/test_platform_resource.py +++ b/smoke-test/tests/platform_resources/test_platform_resource.py @@ -5,8 +5,10 @@ import pytest from datahub.api.entities.platformresource.platform_resource 
import ( + ElasticPlatformResourceQuery, PlatformResource, PlatformResourceKey, + PlatformResourceSearchFields, ) from tests.utils import wait_for_healthcheck_util, wait_for_writes_to_sync @@ -42,7 +44,12 @@ def cleanup_resources(graph_client): logger.warning(f"Failed to delete resource: {e}") # Additional cleanup for any resources that might have been missed - for resource in PlatformResource.search_by_key(graph_client, "test_"): + for resource in PlatformResource.search_by_filters( + graph_client, + ElasticPlatformResourceQuery.create_from().add_wildcard( + PlatformResourceSearchFields.PRIMARY_KEY, "test_*" + ), + ): try: resource.delete(graph_client) except Exception as e: @@ -114,7 +121,7 @@ def test_platform_resource_non_existent(graph_client, test_id): assert platform_resource is None -def test_platform_resource_urn_secondary_key(graph_client, test_id): +def test_platform_resource_urn_secondary_key(graph_client, test_id, cleanup_resources): key = PlatformResourceKey( platform=f"test_platform_{test_id}", resource_type=f"test_resource_type_{test_id}", @@ -129,6 +136,7 @@ def test_platform_resource_urn_secondary_key(graph_client, test_id): secondary_keys=[dataset_urn], ) platform_resource.to_datahub(graph_client) + cleanup_resources.append(platform_resource) wait_for_writes_to_sync() read_platform_resources = [ @@ -141,7 +149,9 @@ def test_platform_resource_urn_secondary_key(graph_client, test_id): assert read_platform_resources[0] == platform_resource -def test_platform_resource_listing_by_resource_type(graph_client, test_id): +def test_platform_resource_listing_by_resource_type( + graph_client, test_id, cleanup_resources +): # Generate two resources with the same resource type key1 = PlatformResourceKey( platform=f"test_platform_{test_id}", @@ -171,13 +181,9 @@ def test_platform_resource_listing_by_resource_type(graph_client, test_id): r for r in PlatformResource.search_by_filters( graph_client, - and_filters=[ - { - "field": "resourceType", - "condition": "EQUAL", - "value": key1.resource_type, - } - ], + query=ElasticPlatformResourceQuery.create_from( + (PlatformResourceSearchFields.RESOURCE_TYPE, key1.resource_type) + ), ) ] assert len(search_results) == 2 @@ -186,3 +192,55 @@ def test_platform_resource_listing_by_resource_type(graph_client, test_id): read_platform_resource_2 = next(r for r in search_results if r.id == key2.id) assert read_platform_resource_1 == platform_resource1 assert read_platform_resource_2 == platform_resource2 + + +def test_platform_resource_listing_complex_queries(graph_client, test_id): + # Generate two resources with the same resource type + key1 = PlatformResourceKey( + platform=f"test_platform1_{test_id}", + resource_type=f"test_resource_type_{test_id}", + primary_key=f"test_primary_key_1_{test_id}", + ) + platform_resource1 = PlatformResource.create( + key=key1, + value={"test_key": f"test_value_1_{test_id}"}, + ) + platform_resource1.to_datahub(graph_client) + + key2 = PlatformResourceKey( + platform=f"test_platform2_{test_id}", + resource_type=f"test_resource_type_{test_id}", + primary_key=f"test_primary_key_2_{test_id}", + ) + platform_resource2 = PlatformResource.create( + key=key2, + value={"test_key": f"test_value_2_{test_id}"}, + ) + platform_resource2.to_datahub(graph_client) + + wait_for_writes_to_sync() + from datahub.api.entities.platformresource.platform_resource import ( + ElasticPlatformResourceQuery, + LogicalOperator, + PlatformResourceSearchFields, + ) + + query = ( + ElasticPlatformResourceQuery.create_from() + 
.group(LogicalOperator.AND) + .add_field_match(PlatformResourceSearchFields.RESOURCE_TYPE, key1.resource_type) + .add_field_not_match(PlatformResourceSearchFields.PLATFORM, key1.platform) + .end() + ) + + search_results = [ + r + for r in PlatformResource.search_by_filters( + graph_client, + query=query, + ) + ] + assert len(search_results) == 1 + + read_platform_resource = search_results[0] + assert read_platform_resource == platform_resource2 From 03c9de6ae5d3060dbcb98014dc3df91444904bb1 Mon Sep 17 00:00:00 2001 From: deepgarg-visa <149145061+deepgarg-visa@users.noreply.github.com> Date: Sun, 20 Oct 2024 06:58:32 +0530 Subject: [PATCH 14/31] fix(docs): fix businessattributes doc (#11653) --- docs/businessattributes.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/businessattributes.md b/docs/businessattributes.md index 1744f48f879e82..3e912e7e609805 100644 --- a/docs/businessattributes.md +++ b/docs/businessattributes.md @@ -28,7 +28,6 @@ Taking the example of "United States- Social Security Number", if an application What you need to create/update and associate business attributes to dataset schema field * **Manage Business Attributes** platform privilege to create/update/delete business attributes. -* **Edit Dataset Column Business Attribute** metadata privilege to associate business attributes to dataset schema field. ## Using Business Attributes As of now Business Attributes can only be created through UI From 9b82a7b029c16b0114c1aeed269e45b11ee9fbe6 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Sun, 20 Oct 2024 23:59:45 -0700 Subject: [PATCH 15/31] feat(ingest/fivetran): add safeguards on table/column lineage (#11674) --- .../ingestion/source/fivetran/config.py | 19 +-- .../ingestion/source/fivetran/data_classes.py | 2 +- .../ingestion/source/fivetran/fivetran.py | 23 ++- .../source/fivetran/fivetran_log_api.py | 86 +++++------ .../source/fivetran/fivetran_query.py | 143 +++++++++++------- .../integration/fivetran/test_fivetran.py | 6 +- 6 files changed, 156 insertions(+), 123 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py index 02eb096b240f52..2fb5ffd16ea34c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py @@ -1,6 +1,6 @@ +import dataclasses import logging -from dataclasses import dataclass, field as dataclass_field -from typing import Dict, List, Optional +from typing import Dict, Optional import pydantic from pydantic import Field, root_validator @@ -23,6 +23,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulIngestionConfigBase, ) +from datahub.utilities.lossy_collections import LossyList from datahub.utilities.perf_timer import PerfTimer logger = logging.getLogger(__name__) @@ -114,24 +115,24 @@ def validate_destination_platfrom_and_config(cls, values: Dict) -> Dict: return values -@dataclass +@dataclasses.dataclass class MetadataExtractionPerfReport(Report): - connectors_metadata_extraction_sec: PerfTimer = dataclass_field( + connectors_metadata_extraction_sec: PerfTimer = dataclasses.field( default_factory=PerfTimer ) - connectors_lineage_extraction_sec: PerfTimer = dataclass_field( + connectors_lineage_extraction_sec: PerfTimer = dataclasses.field( default_factory=PerfTimer ) - connectors_jobs_extraction_sec: PerfTimer = dataclass_field( + connectors_jobs_extraction_sec: PerfTimer = dataclasses.field( 
default_factory=PerfTimer ) -@dataclass +@dataclasses.dataclass class FivetranSourceReport(StaleEntityRemovalSourceReport): connectors_scanned: int = 0 - filtered_connectors: List[str] = dataclass_field(default_factory=list) - metadata_extraction_perf: MetadataExtractionPerfReport = dataclass_field( + filtered_connectors: LossyList[str] = dataclasses.field(default_factory=LossyList) + metadata_extraction_perf: MetadataExtractionPerfReport = dataclasses.field( default_factory=MetadataExtractionPerfReport ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py index 18de2b01edd3b7..046aa9efe3f59b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py @@ -24,7 +24,7 @@ class Connector: sync_frequency: int destination_id: str user_id: str - table_lineage: List[TableLineage] + lineage: List[TableLineage] jobs: List["Job"] diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py index 334bb58ea84f8e..c27ec57c2e99ec 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py @@ -27,9 +27,10 @@ PlatformDetail, ) from datahub.ingestion.source.fivetran.data_classes import Connector, Job -from datahub.ingestion.source.fivetran.fivetran_log_api import ( +from datahub.ingestion.source.fivetran.fivetran_log_api import FivetranLogAPI +from datahub.ingestion.source.fivetran.fivetran_query import ( MAX_JOBS_PER_CONNECTOR, - FivetranLogAPI, + MAX_TABLE_LINEAGE_PER_CONNECTOR, ) from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, @@ -106,13 +107,21 @@ def _extend_lineage(self, connector: Connector, datajob: DataJob) -> None: f"Fivetran connector source type: {connector.connector_type} is not supported to mapped with Datahub dataset entity." ) - for table_lineage in connector.table_lineage: + if len(connector.lineage) >= MAX_TABLE_LINEAGE_PER_CONNECTOR: + self.report.warning( + title="Table lineage truncated", + message=f"The connector had more than {MAX_TABLE_LINEAGE_PER_CONNECTOR} table lineage entries. 
" + f"Only the most recent {MAX_TABLE_LINEAGE_PER_CONNECTOR} entries were ingested.", + context=f"{connector.connector_name} (connector_id: {connector.connector_id})", + ) + + for lineage in connector.lineage: input_dataset_urn = DatasetUrn.create_from_ids( platform_id=source_platform, table_name=( - f"{source_database.lower()}.{table_lineage.source_table}" + f"{source_database.lower()}.{lineage.source_table}" if source_database - else table_lineage.source_table + else lineage.source_table ), env=source_platform_detail.env, platform_instance=source_platform_detail.platform_instance, @@ -121,14 +130,14 @@ def _extend_lineage(self, connector: Connector, datajob: DataJob) -> None: output_dataset_urn = DatasetUrn.create_from_ids( platform_id=self.config.fivetran_log_config.destination_platform, - table_name=f"{self.audit_log.fivetran_log_database.lower()}.{table_lineage.destination_table}", + table_name=f"{self.audit_log.fivetran_log_database.lower()}.{lineage.destination_table}", env=destination_platform_detail.env, platform_instance=destination_platform_detail.platform_instance, ) output_dataset_urn_list.append(output_dataset_urn) if self.config.include_column_lineage: - for column_lineage in table_lineage.column_lineage: + for column_lineage in lineage.column_lineage: fine_grained_lineage.append( FineGrainedLineage( upstreamType=FineGrainedLineageUpstreamType.FIELD_SET, diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py index 5908efe39e2b40..b55c8bbbd607fa 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py @@ -1,6 +1,7 @@ import functools import json import logging +from collections import defaultdict from typing import Any, Dict, List, Optional, Tuple import sqlglot @@ -22,10 +23,6 @@ logger: logging.Logger = logging.getLogger(__name__) -# We don't want to generate a massive number of dataProcesses for a single connector. -# This is primarily used as a safeguard to prevent performance issues. 
-MAX_JOBS_PER_CONNECTOR = 1000 - class FivetranLogAPI: def __init__(self, fivetran_log_config: FivetranLogConfig) -> None: @@ -91,55 +88,51 @@ def _query(self, query: str) -> List[Dict]: resp = self.engine.execute(query) return [row for row in resp] - def _get_column_lineage_metadata(self) -> Dict[str, List]: + def _get_column_lineage_metadata(self) -> Dict[Tuple[str, str], List]: """ - Return's dict of column lineage metadata with key as '-' + Returns dict of column lineage metadata with key as (, ) """ - all_column_lineage: Dict[str, List] = {} + all_column_lineage = defaultdict(list) column_lineage_result = self._query( self.fivetran_log_query.get_column_lineage_query() ) for column_lineage in column_lineage_result: - key = f"{column_lineage[Constant.SOURCE_TABLE_ID]}-{column_lineage[Constant.DESTINATION_TABLE_ID]}" - if key not in all_column_lineage: - all_column_lineage[key] = [column_lineage] - else: - all_column_lineage[key].append(column_lineage) - return all_column_lineage + key = ( + column_lineage[Constant.SOURCE_TABLE_ID], + column_lineage[Constant.DESTINATION_TABLE_ID], + ) + all_column_lineage[key].append(column_lineage) + return dict(all_column_lineage) - def _get_connectors_table_lineage_metadata(self) -> Dict[str, List]: + def _get_table_lineage_metadata(self) -> Dict[str, List]: """ - Return's dict of table lineage metadata with key as 'CONNECTOR_ID' + Returns dict of table lineage metadata with key as 'CONNECTOR_ID' """ - connectors_table_lineage_metadata: Dict[str, List] = {} + connectors_table_lineage_metadata = defaultdict(list) table_lineage_result = self._query( self.fivetran_log_query.get_table_lineage_query() ) for table_lineage in table_lineage_result: - if ( + connectors_table_lineage_metadata[ table_lineage[Constant.CONNECTOR_ID] - not in connectors_table_lineage_metadata - ): - connectors_table_lineage_metadata[ - table_lineage[Constant.CONNECTOR_ID] - ] = [table_lineage] - else: - connectors_table_lineage_metadata[ - table_lineage[Constant.CONNECTOR_ID] - ].append(table_lineage) - return connectors_table_lineage_metadata + ].append(table_lineage) + return dict(connectors_table_lineage_metadata) - def _get_table_lineage( + def _extract_connector_lineage( self, - column_lineage_metadata: Dict[str, List], table_lineage_result: Optional[List], + column_lineage_metadata: Dict[Tuple[str, str], List], ) -> List[TableLineage]: table_lineage_list: List[TableLineage] = [] if table_lineage_result is None: return table_lineage_list for table_lineage in table_lineage_result: + # Join the column lineage into the table lineage. 
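+            # Lookup key is the (source_table_id, destination_table_id) tuple built by
+            # _get_column_lineage_metadata, replacing the old "-"-joined string key.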
column_lineage_result = column_lineage_metadata.get( - f"{table_lineage[Constant.SOURCE_TABLE_ID]}-{table_lineage[Constant.DESTINATION_TABLE_ID]}" + ( + table_lineage[Constant.SOURCE_TABLE_ID], + table_lineage[Constant.DESTINATION_TABLE_ID], + ) ) column_lineage_list: List[ColumnLineage] = [] if column_lineage_result: @@ -152,6 +145,7 @@ def _get_table_lineage( ) for column_lineage in column_lineage_result ] + table_lineage_list.append( TableLineage( source_table=f"{table_lineage[Constant.SOURCE_SCHEMA_NAME]}.{table_lineage[Constant.SOURCE_TABLE_NAME]}", @@ -167,14 +161,9 @@ def _get_all_connector_sync_logs( ) -> Dict[str, Dict[str, Dict[str, Tuple[float, Optional[str]]]]]: sync_logs: Dict[str, Dict[str, Dict[str, Tuple[float, Optional[str]]]]] = {} - # Format connector_ids as a comma-separated string of quoted IDs - formatted_connector_ids = ", ".join(f"'{id}'" for id in connector_ids) - - query = self.fivetran_log_query.get_sync_logs_query().format( - db_clause=self.fivetran_log_query.db_clause, + query = self.fivetran_log_query.get_sync_logs_query( syncs_interval=syncs_interval, - max_jobs_per_connector=MAX_JOBS_PER_CONNECTOR, - connector_ids=formatted_connector_ids, + connector_ids=connector_ids, ) for row in self._query(query): @@ -234,13 +223,13 @@ def get_user_email(self, user_id: str) -> Optional[str]: return None return self._get_users().get(user_id) - def _fill_connectors_table_lineage(self, connectors: List[Connector]) -> None: - table_lineage_metadata = self._get_connectors_table_lineage_metadata() + def _fill_connectors_lineage(self, connectors: List[Connector]) -> None: + table_lineage_metadata = self._get_table_lineage_metadata() column_lineage_metadata = self._get_column_lineage_metadata() for connector in connectors: - connector.table_lineage = self._get_table_lineage( - column_lineage_metadata=column_lineage_metadata, + connector.lineage = self._extract_connector_lineage( table_lineage_result=table_lineage_metadata.get(connector.connector_id), + column_lineage_metadata=column_lineage_metadata, ) def _fill_connectors_jobs( @@ -262,6 +251,7 @@ def get_allowed_connectors_list( ) -> List[Connector]: connectors: List[Connector] = [] with report.metadata_extraction_perf.connectors_metadata_extraction_sec: + logger.info("Fetching connector list") connector_list = self._query(self.fivetran_log_query.get_connectors_query()) for connector in connector_list: if not connector_patterns.allowed(connector[Constant.CONNECTOR_NAME]): @@ -279,12 +269,20 @@ def get_allowed_connectors_list( sync_frequency=connector[Constant.SYNC_FREQUENCY], destination_id=connector[Constant.DESTINATION_ID], user_id=connector[Constant.CONNECTING_USER_ID], - table_lineage=[], - jobs=[], + lineage=[], # filled later + jobs=[], # filled later ) ) + + if not connectors: + # Some of our queries don't work well when there's no connectors, since + # we push down connector id filters. 
+ return [] + with report.metadata_extraction_perf.connectors_lineage_extraction_sec: - self._fill_connectors_table_lineage(connectors) + logger.info("Fetching connector lineage") + self._fill_connectors_lineage(connectors) with report.metadata_extraction_perf.connectors_jobs_extraction_sec: + logger.info("Fetching connector job run history") self._fill_connectors_jobs(connectors, syncs_interval) return connectors diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py index c4680b4b1037a2..c9e329b706768f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py @@ -1,3 +1,11 @@ +from typing import List + +# Safeguards to prevent fetching massive amounts of data. +MAX_TABLE_LINEAGE_PER_CONNECTOR = 100 +MAX_COLUMN_LINEAGE_PER_CONNECTOR = 3000 +MAX_JOBS_PER_CONNECTOR = 1000 + + class FivetranLogQuery: # Note: All queries are written in Snowflake SQL. # They will be transpiled to the target database's SQL dialect at runtime. @@ -24,69 +32,88 @@ def get_connectors_query(self) -> str: destination_id FROM {self.db_clause}connector WHERE - _fivetran_deleted = FALSE\ + _fivetran_deleted = FALSE """ def get_users_query(self) -> str: - return f""" - SELECT id as user_id, - given_name, - family_name, - email - FROM {self.db_clause}user""" + return f"""\ +SELECT id as user_id, +given_name, +family_name, +email +FROM {self.db_clause}user +""" - def get_sync_logs_query(self) -> str: - return """ - WITH ranked_syncs AS ( - SELECT - connector_id, - sync_id, - MAX(CASE WHEN message_event = 'sync_start' THEN time_stamp END) as start_time, - MAX(CASE WHEN message_event = 'sync_end' THEN time_stamp END) as end_time, - MAX(CASE WHEN message_event = 'sync_end' THEN message_data END) as end_message_data, - ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY MAX(time_stamp) DESC) as rn - FROM {db_clause}log - WHERE message_event in ('sync_start', 'sync_end') - AND time_stamp > CURRENT_TIMESTAMP - INTERVAL '{syncs_interval} days' - AND connector_id IN ({connector_ids}) - GROUP BY connector_id, sync_id - ) - SELECT - connector_id, - sync_id, - start_time, - end_time, - end_message_data - FROM ranked_syncs - WHERE rn <= {max_jobs_per_connector} - AND start_time IS NOT NULL - AND end_time IS NOT NULL - ORDER BY connector_id, end_time DESC - """ + def get_sync_logs_query( + self, + syncs_interval: int, + connector_ids: List[str], + ) -> str: + # Format connector_ids as a comma-separated string of quoted IDs + formatted_connector_ids = ", ".join(f"'{id}'" for id in connector_ids) + + return f"""\ +WITH ranked_syncs AS ( + SELECT + connector_id, + sync_id, + MAX(CASE WHEN message_event = 'sync_start' THEN time_stamp END) as start_time, + MAX(CASE WHEN message_event = 'sync_end' THEN time_stamp END) as end_time, + MAX(CASE WHEN message_event = 'sync_end' THEN message_data END) as end_message_data, + ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY MAX(time_stamp) DESC) as rn + FROM {self.db_clause}log + WHERE message_event in ('sync_start', 'sync_end') + AND time_stamp > CURRENT_TIMESTAMP - INTERVAL '{syncs_interval} days' + AND connector_id IN ({formatted_connector_ids}) + GROUP BY connector_id, sync_id +) +SELECT + connector_id, + sync_id, + start_time, + end_time, + end_message_data +FROM ranked_syncs +WHERE rn <= {MAX_JOBS_PER_CONNECTOR} + AND start_time IS NOT NULL + AND end_time IS NOT NULL 
+ORDER BY connector_id, end_time DESC +""" def get_table_lineage_query(self) -> str: - return f""" - SELECT stm.connector_id as connector_id, - stm.id as source_table_id, - stm.name as source_table_name, - ssm.name as source_schema_name, - dtm.id as destination_table_id, - dtm.name as destination_table_name, - dsm.name as destination_schema_name - FROM {self.db_clause}table_lineage as tl - JOIN {self.db_clause}source_table_metadata as stm on tl.source_table_id = stm.id - JOIN {self.db_clause}destination_table_metadata as dtm on tl.destination_table_id = dtm.id - JOIN {self.db_clause}source_schema_metadata as ssm on stm.schema_id = ssm.id - JOIN {self.db_clause}destination_schema_metadata as dsm on dtm.schema_id = dsm.id""" + return f"""\ +SELECT + stm.connector_id as connector_id, + stm.id as source_table_id, + stm.name as source_table_name, + ssm.name as source_schema_name, + dtm.id as destination_table_id, + dtm.name as destination_table_name, + dsm.name as destination_schema_name +FROM {self.db_clause}table_lineage as tl +JOIN {self.db_clause}source_table_metadata as stm on tl.source_table_id = stm.id +JOIN {self.db_clause}destination_table_metadata as dtm on tl.destination_table_id = dtm.id +JOIN {self.db_clause}source_schema_metadata as ssm on stm.schema_id = ssm.id +JOIN {self.db_clause}destination_schema_metadata as dsm on dtm.schema_id = dsm.id +QUALIFY ROW_NUMBER() OVER (PARTITION BY stm.connector_id ORDER BY tl.created_at DESC) <= {MAX_TABLE_LINEAGE_PER_CONNECTOR} +ORDER BY stm.connector_id, tl.created_at DESC +""" def get_column_lineage_query(self) -> str: - return f""" - SELECT scm.table_id as source_table_id, - dcm.table_id as destination_table_id, - scm.name as source_column_name, - dcm.name as destination_column_name - FROM {self.db_clause}column_lineage as cl - JOIN {self.db_clause}source_column_metadata as scm - on cl.source_column_id = scm.id - JOIN {self.db_clause}destination_column_metadata as dcm - on cl.destination_column_id = dcm.id""" + return f"""\ +SELECT + scm.table_id as source_table_id, + dcm.table_id as destination_table_id, + scm.name as source_column_name, + dcm.name as destination_column_name +FROM {self.db_clause}column_lineage as cl +JOIN {self.db_clause}source_column_metadata as scm + ON cl.source_column_id = scm.id +JOIN {self.db_clause}destination_column_metadata as dcm + ON cl.destination_column_id = dcm.id +-- Only joining source_table_metadata to get the connector_id. 
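+-- The QUALIFY below mirrors the table-lineage query above: rows are numbered per
+-- connector by recency and anything past MAX_COLUMN_LINEAGE_PER_CONNECTOR is dropped.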
+JOIN {self.db_clause}source_table_metadata as stm + ON scm.table_id = stm.id +QUALIFY ROW_NUMBER() OVER (PARTITION BY stm.connector_id ORDER BY cl.created_at DESC) <= {MAX_COLUMN_LINEAGE_PER_CONNECTOR} +ORDER BY stm.connector_id, cl.created_at DESC +""" diff --git a/metadata-ingestion/tests/integration/fivetran/test_fivetran.py b/metadata-ingestion/tests/integration/fivetran/test_fivetran.py index 33ac09e69a3c0a..e72162b12e48fd 100644 --- a/metadata-ingestion/tests/integration/fivetran/test_fivetran.py +++ b/metadata-ingestion/tests/integration/fivetran/test_fivetran.py @@ -100,11 +100,9 @@ def default_query_results( "email": "abc.xyz@email.com", } ] - elif query == fivetran_log_query.get_sync_logs_query().format( - db_clause=fivetran_log_query.db_clause, + elif query == fivetran_log_query.get_sync_logs_query( syncs_interval=7, - max_jobs_per_connector=1000, - connector_ids="'calendar_elected'", + connector_ids=["calendar_elected"], ): return [ { From ed7c368e1dec1675cf7fa2cada57c41a70fd385c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20L=C3=BCdin?= <13187726+Masterchen09@users.noreply.github.com> Date: Mon, 21 Oct 2024 09:00:09 +0200 Subject: [PATCH 16/31] fix(ui): show DataHub logo for DataHub sources in ingestion souces list (#11658) Co-authored-by: Shirshanka Das --- .../src/app/ingest/source/builder/constants.ts | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/datahub-web-react/src/app/ingest/source/builder/constants.ts b/datahub-web-react/src/app/ingest/source/builder/constants.ts index b67ca388c10546..0e0ba8b22e37ef 100644 --- a/datahub-web-react/src/app/ingest/source/builder/constants.ts +++ b/datahub-web-react/src/app/ingest/source/builder/constants.ts @@ -35,6 +35,7 @@ import csvLogo from '../../../../images/csv-logo.png'; import qlikLogo from '../../../../images/qliklogo.png'; import sigmaLogo from '../../../../images/sigmalogo.png'; import sacLogo from '../../../../images/saclogo.svg'; +import datahubLogo from '../../../../images/datahublogo.png'; export const ATHENA = 'athena'; export const ATHENA_URN = `urn:li:dataPlatform:${ATHENA}`; @@ -125,6 +126,11 @@ export const SIGMA = 'sigma'; export const SIGMA_URN = `urn:li:dataPlatform:${SIGMA}`; export const SAC = 'sac'; export const SAC_URN = `urn:li:dataPlatform:${SAC}`; +export const DATAHUB = 'datahub'; +export const DATAHUB_GC = 'datahub-gc'; +export const DATAHUB_LINEAGE_FILE = 'datahub-lineage-file'; +export const DATAHUB_BUSINESS_GLOSSARY = 'datahub-business-glossary'; +export const DATAHUB_URN = `urn:li:dataPlatform:${DATAHUB}`; export const PLATFORM_URN_TO_LOGO = { [ATHENA_URN]: athenaLogo, @@ -165,6 +171,7 @@ export const PLATFORM_URN_TO_LOGO = { [QLIK_SENSE_URN]: qlikLogo, [SIGMA_URN]: sigmaLogo, [SAC_URN]: sacLogo, + [DATAHUB_URN]: datahubLogo, }; export const SOURCE_TO_PLATFORM_URN = { @@ -178,5 +185,7 @@ export const SOURCE_TO_PLATFORM_URN = { [SNOWFLAKE_USAGE]: SNOWFLAKE_URN, [STARBURST_TRINO_USAGE]: TRINO_URN, [DBT_CLOUD]: DBT_URN, - [VERTICA]: VERTICA_URN, + [DATAHUB_GC]: DATAHUB_URN, + [DATAHUB_LINEAGE_FILE]: DATAHUB_URN, + [DATAHUB_BUSINESS_GLOSSARY]: DATAHUB_URN, }; From 6bcf5a9594316635aec408e3f4fc6021e45653dd Mon Sep 17 00:00:00 2001 From: kbartlett Date: Tue, 19 Nov 2024 13:37:39 -0500 Subject: [PATCH 17/31] feat: connector for Neo4j --- .../app/ingest/source/builder/constants.ts | 4 + .../app/ingest/source/builder/sources.json | 8 ++ .../docs/sources/neo4j/neo4j.md | 126 ------------------ .../docs/sources/neo4j/neo4j_recipe.yml | 4 +- .../recipes/neo4j_to_datahub.dhub.yaml 
| 15 --- .../ingestion/source/common/subtypes.py | 2 + .../ingestion/source/neo4j/neo4j_source.py | 101 +++++--------- .../bootstrap_mcps/data-platforms.yaml | 11 ++ 8 files changed, 62 insertions(+), 209 deletions(-) delete mode 100644 metadata-ingestion/examples/recipes/neo4j_to_datahub.dhub.yaml diff --git a/datahub-web-react/src/app/ingest/source/builder/constants.ts b/datahub-web-react/src/app/ingest/source/builder/constants.ts index 7b7b880069f676..9edcbd7174ab92 100644 --- a/datahub-web-react/src/app/ingest/source/builder/constants.ts +++ b/datahub-web-react/src/app/ingest/source/builder/constants.ts @@ -37,6 +37,7 @@ import qlikLogo from '../../../../images/qliklogo.png'; import sigmaLogo from '../../../../images/sigmalogo.png'; import sacLogo from '../../../../images/saclogo.svg'; import datahubLogo from '../../../../images/datahublogo.png'; +import neo4j from '../../../../images/neo4j.png' export const ATHENA = 'athena'; export const ATHENA_URN = `urn:li:dataPlatform:${ATHENA}`; @@ -134,6 +135,8 @@ export const DATAHUB_GC = 'datahub-gc'; export const DATAHUB_LINEAGE_FILE = 'datahub-lineage-file'; export const DATAHUB_BUSINESS_GLOSSARY = 'datahub-business-glossary'; export const DATAHUB_URN = `urn:li:dataPlatform:${DATAHUB}`; +export const NEO4J = "neo4j" +export const NEO4J_URN = `urn:li:dataPlatform:${NEO4J}` export const PLATFORM_URN_TO_LOGO = { [ATHENA_URN]: athenaLogo, @@ -176,6 +179,7 @@ export const PLATFORM_URN_TO_LOGO = { [SIGMA_URN]: sigmaLogo, [SAC_URN]: sacLogo, [DATAHUB_URN]: datahubLogo, + [NEO4J_URN]: neo4j }; export const SOURCE_TO_PLATFORM_URN = { diff --git a/datahub-web-react/src/app/ingest/source/builder/sources.json b/datahub-web-react/src/app/ingest/source/builder/sources.json index 9f54fe23631bcd..9830259899f0d3 100644 --- a/datahub-web-react/src/app/ingest/source/builder/sources.json +++ b/datahub-web-react/src/app/ingest/source/builder/sources.json @@ -310,5 +310,13 @@ "description": "Import Spaces, Sources, Tables and statistics from Dremio.", "docsUrl": "https://datahubproject.io/docs/metadata-ingestion/", "recipe": "source:\n type: dremio\n config:\n # Coordinates\n hostname: null\n port: null\n #true if https, otherwise false\n tls: true\n\n #For cloud instance\n #is_dremio_cloud: True\n #dremio_cloud_project_id: \n\n #Credentials with personal access token\n authentication_method: PAT\n password: pass\n\n #Or Credentials with basic auth\n #authentication_method: password\n #username: null\n #password: null\n\n stateful_ingestion:\n enabled: true" + }, + { + "urn": "urn:li:dataPlatform:neo4j", + "name": "neo4j", + "displayName": "Neo4j", + "description": "Import Nodes and Relationships from Neo4j.", + "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/neo4j/", + "recipe": "source:\n type: 'neo4j'\n config:\n uri: 'neo4j+ssc://host:7687'\n username: 'neo4j'\n password: 'password'\n env: 'PROD'\n\nsink:\n type: \"datahub-rest\"\n config:\n server: 'http://localhost:8080'" } ] diff --git a/metadata-ingestion/docs/sources/neo4j/neo4j.md b/metadata-ingestion/docs/sources/neo4j/neo4j.md index 7bd3cf470e330a..d4dab2c6c7e1f2 100644 --- a/metadata-ingestion/docs/sources/neo4j/neo4j.md +++ b/metadata-ingestion/docs/sources/neo4j/neo4j.md @@ -18,129 +18,3 @@ In order to ingest metadata from Neo4j, you will need: * Neo4j instance with APOC installed - -### Install the Plugin(s) - -Run the following commands to install the relevant plugin(s): - - -Use the following recipe(s) to get started with ingestion. - -
- View All Recipe Configuartion Options - - | Field | Required | Default | Description | - |--------------------|:--------:|:---------------:|---------------------------------------| - | source | | | | - | `type` | ✅ | `neo4j` | A required field with a default value | - | config | | | | - | `uri` | ✅ | `default_value` | The URI for the Neo4j server | - | `username` | ✅ | None | Neo4j Username | - | `password` | ✅ | None | Neo4j Password - | `gms_server` | ✅ | None |Address for the gms server| - | `node_tag` | ❌ | `Node` |The tag that will be used to show that the Neo4j object is a Node| - | `relationship_tag` | ❌ | `Relationship` |The tag that will be used to show that the Neo4j object is a Relationship| - | `environment` | ✅ | None || - | sink | | || - | `type` | ✅ | None || - | conifg | | || - | `server` | ✅ | None || - -
- - -```yml -source: - type: 'neo4j' - config: - uri: 'neo4j+ssc://host:7687' - username: 'neo4j' - password: 'password' - gms_server: &gms_server 'http://localhost:8080' - node_tag: 'Node' - relationship_tag: 'Relationship' - environment: 'PROD' - -sink: - type: "datahub-rest" - config: - server: *gms_server -``` - - - -### Sample data that is returned from Neo4j. This is the data that is parsed and used to create Nodes, Relationships. - - - Example relationship: - { - relationship_name: { - count: 1, - properties: {}, - type: "relationship" - } - } - - Example node: - { - key: Neo4j_Node, - value: { - count: 10, - labels: [], - properties: { - node_id: { - unique: true, - indexed: true, - type: "STRING", - existence: false - }, - node_name: { - unique: false, - indexed: false, - type: "STRING", - existence: false - } - }, - type: "node", - relationships: { - RELATIONSHIP_1: { - count: 10, - direction: "in", - labels: ["Node_1", "Node_2", "Node_3"], - properties: { - relationsip_name: { - indexed: false, - type: "STRING", - existence: false, - array: false - }, - relationship_id: { - indexed: false, - type: "INTEGER", - existence: false, - array: false - } - } - }, - RELATIONSHIP_2: { - count: 10, - direction: "out", - labels: ["Node_4"], - properties: { - relationship_name: { - indexed: false, - type: "STRING", - existence: false, - array: false - }, - relationship_id: { - indexed: false, - type: "INTEGER", - existence: false, - array: false - } - } - } - } - } - } diff --git a/metadata-ingestion/docs/sources/neo4j/neo4j_recipe.yml b/metadata-ingestion/docs/sources/neo4j/neo4j_recipe.yml index 61778ef3decef6..463d65e7ba323b 100644 --- a/metadata-ingestion/docs/sources/neo4j/neo4j_recipe.yml +++ b/metadata-ingestion/docs/sources/neo4j/neo4j_recipe.yml @@ -4,9 +4,7 @@ source: uri: 'neo4j+ssc://host:7687' username: 'neo4j' password: 'password' - node_tag: 'Node' - relationship_tag: 'Relationship' - environment: 'PROD' + env: 'PROD' sink: type: "datahub-rest" diff --git a/metadata-ingestion/examples/recipes/neo4j_to_datahub.dhub.yaml b/metadata-ingestion/examples/recipes/neo4j_to_datahub.dhub.yaml deleted file mode 100644 index af5985b1575e2c..00000000000000 --- a/metadata-ingestion/examples/recipes/neo4j_to_datahub.dhub.yaml +++ /dev/null @@ -1,15 +0,0 @@ -source: - type: 'neo4j' - config: - uri: 'neo4j+ssc://host:7687' - username: 'neo4j' - password: 'password' - gms_server: 'http://localhost:8080' - node_tag: 'Node' - relationship_tag: 'Relationship' - environment: 'PROD' - -sink: - type: "datahub-rest" - config: - server: 'http://localhost:8080' \ No newline at end of file diff --git a/metadata-ingestion/src/datahub/ingestion/source/common/subtypes.py b/metadata-ingestion/src/datahub/ingestion/source/common/subtypes.py index 7271bf6102639f..b74f6f67510194 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/common/subtypes.py +++ b/metadata-ingestion/src/datahub/ingestion/source/common/subtypes.py @@ -22,6 +22,8 @@ class DatasetSubTypes(StrEnum): SAC_MODEL = "Model" SAC_IMPORT_DATA_MODEL = "Import Data Model" SAC_LIVE_DATA_MODEL = "Live Data Model" + NEO4J_NODE = "Neo4j Node" + NEO4J_RELATIONSHIP = "Neo4j Relationship" # TODO: Create separate entity... 
NOTEBOOK = "Notebook" diff --git a/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py b/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py index 2060007b2c1940..5f52c3c438e677 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py @@ -8,11 +8,7 @@ from pydantic.fields import Field from datahub.configuration.source_common import EnvConfigMixin -from datahub.emitter.mce_builder import ( - make_data_platform_urn, - make_dataset_urn, - make_tag_urn, -) +from datahub.emitter.mce_builder import make_data_platform_urn, make_dataset_urn from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.decorators import ( @@ -23,20 +19,19 @@ ) from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph +from datahub.ingestion.source.common.subtypes import DatasetSubTypes from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaFieldDataType from datahub.metadata.schema_classes import ( AuditStampClass, BooleanTypeClass, DatasetPropertiesClass, DateTypeClass, - GlobalTagsClass, NumberTypeClass, OtherSchemaClass, SchemaFieldClass, SchemaMetadataClass, StringTypeClass, - TagAssociationClass, + SubTypesClass, UnionTypeClass, ) @@ -60,15 +55,7 @@ class Neo4jConfig(EnvConfigMixin): username: str = Field(default=None, description="Neo4j Username") password: str = Field(default=None, description="Neo4j Password") uri: str = Field(default=None, description="The URI for the Neo4j server") - environment: str = Field(default=None, description="Neo4j env") - node_tag: str = Field( - default="Node", - description="The tag that will be used to show that the Neo4j object is a Node", - ) - relationship_tag: str = Field( - default="Relationship", - description="The tag that will be used to show that the Neo4j object is a Relationship", - ) + env: str = Field(default=None, description="Neo4j env") platform: str = Field(default="neo4j", description="Neo4j platform") @@ -124,7 +111,7 @@ def add_properties( ) return MetadataChangeProposalWrapper( entityUrn=make_dataset_urn( - platform=self.config.platform, name=dataset, env=self.config.environment + platform=self.config.platform, name=dataset, env=self.config.env ), aspect=dataset_properties, ) @@ -140,7 +127,7 @@ def generate_neo4j_object( ] mcp = MetadataChangeProposalWrapper( entityUrn=make_dataset_urn( - platform=platform, name=dataset, env=self.config.environment + platform=platform, name=dataset, env=self.config.env ), aspect=SchemaMetadataClass( schemaName=dataset, @@ -154,9 +141,6 @@ def generate_neo4j_object( ), fields=fields, ), - systemMetadata=DatasetPropertiesClass( - customProperties={"properties": "property on object"} - ), ) self.report.obj_created += 1 return mcp @@ -164,32 +148,6 @@ def generate_neo4j_object( log.error(e) self.report.obj_failures += 1 - def add_tag_to_dataset( - self, table_name: str, tag_name: str - ) -> MetadataChangeProposalWrapper: - graph = DataHubGraph( - DatahubClientConfig(server=self.ctx.pipeline_config.sink.config["server"]) - ) - dataset_urn = make_dataset_urn( - platform=self.config.platform, name=table_name, env=self.config.environment - ) - current_tags: Optional[GlobalTagsClass] = graph.get_aspect( - entity_urn=dataset_urn, - aspect_type=GlobalTagsClass, 
- ) - tag_to_add = make_tag_urn(tag_name) - tag_association_to_add = TagAssociationClass(tag=tag_to_add) - - if current_tags: - if tag_to_add not in [x.tag for x in current_tags.tags]: - current_tags.tags.append(TagAssociationClass(tag_to_add)) - else: - current_tags = GlobalTagsClass(tags=[tag_association_to_add]) - return MetadataChangeProposalWrapper( - entityUrn=dataset_urn, - aspect=current_tags, - ) - def get_neo4j_metadata(self, query: str) -> pd.DataFrame: driver = GraphDatabase.driver( self.config.uri, auth=(self.config.username, self.config.password) @@ -211,18 +169,23 @@ def get_neo4j_metadata(self, query: str) -> pd.DataFrame: See the docs for examples of metadata: metadata-ingestion/docs/sources/neo4j/neo4j.md """ - log.info(f"{query}") - with driver.session() as session: - result = session.run(query) - data = [record for record in result] - log.info("Closing Neo4j driver") - driver.close() - - node_df = self.process_nodes(data) - rel_df = self.process_relationships(data, node_df) - - union_cols = ["key", "obj_type", "property_data_types", "description"] - df = pd.concat([node_df[union_cols], rel_df[union_cols]]) + try: + log.info(f"{query}") + with driver.session() as session: + result = session.run(query) + data = [record for record in result] + log.info("Closing Neo4j driver") + driver.close() + + node_df = self.process_nodes(data) + rel_df = self.process_relationships(data, node_df) + + union_cols = ["key", "obj_type", "property_data_types", "description"] + df = pd.concat([node_df[union_cols], rel_df[union_cols]]) + except Exception as e: + self.report.failure( + exc=e, + ) return df @@ -326,11 +289,19 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: yield MetadataWorkUnit( id=row["key"], - mcp=self.add_tag_to_dataset( - table_name=row["key"], - tag_name=self.config.node_tag - if row["obj_type"] == "node" - else self.config.relationship_tag, + mcp=MetadataChangeProposalWrapper( + entityUrn=make_dataset_urn( + platform=self.config.platform, + name=row["key"], + env=self.config.env, + ), + aspect=SubTypesClass( + typeNames=[ + DatasetSubTypes.NEO4J_NODE + if row["obj_type"] == "node" + else DatasetSubTypes.NEO4J_RELATIONSHIP + ] + ), ), ) diff --git a/metadata-service/configuration/src/main/resources/bootstrap_mcps/data-platforms.yaml b/metadata-service/configuration/src/main/resources/bootstrap_mcps/data-platforms.yaml index f480ec862bc4e2..0e3ed36d510094 100644 --- a/metadata-service/configuration/src/main/resources/bootstrap_mcps/data-platforms.yaml +++ b/metadata-service/configuration/src/main/resources/bootstrap_mcps/data-platforms.yaml @@ -717,3 +717,14 @@ displayName: Dremio type: QUERY_ENGINE logoUrl: "/assets/platforms/dremiologo.png" +- entityUrn: urn:li:dataPlatform:neo4j + entityType: dataPlatform + aspectName: dataPlatformInfo + changeType: UPSERT + aspect: + datasetNameDelimiter: "." 
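All of the schema extraction above hangs off a single APOC call; running that call directly is a quick way to see the structure that process_nodes() and process_relationships() consume. A minimal sketch, assuming a reachable Neo4j instance with APOC installed and placeholder connection details:

    from neo4j import GraphDatabase

    # The query string is the same one issued by get_workunits_internal().
    driver = GraphDatabase.driver("neo4j+ssc://host:7687", auth=("neo4j", "password"))
    with driver.session() as session:
        result = session.run(
            "CALL apoc.meta.schema() YIELD value "
            "UNWIND keys(value) AS key RETURN key, value[key] AS value;"
        )
        for record in result:
            # Each record is one label or relationship type; value["type"] is "node" or
            # "relationship", and value["properties"] becomes the dataset's schema fields.
            print(record["key"], record["value"]["type"])
    driver.close()
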
+ name: neo4j + displayName: Neo4j + type: OTHER + logoUrl: "/assets/platforms/neo4j.png" + From aadd71acbdd31913baccc7b7693cf27431dcb909 Mon Sep 17 00:00:00 2001 From: kbartlett Date: Thu, 21 Nov 2024 14:47:14 -0500 Subject: [PATCH 18/31] feat: connector for Neo4j --- metadata-ingestion/setup.py | 3 + .../tests/unit/test_neo4j_source.py | 325 +++++++++++------- 2 files changed, 197 insertions(+), 131 deletions(-) diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 2469af74b03343..cb99d4955dd0bc 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -524,6 +524,7 @@ "qlik-sense": sqlglot_lib | {"requests", "websocket-client"}, "sigma": sqlglot_lib | {"requests"}, "sac": sac, + "neo4j": {"pandas", "neo4j"}, } # This is mainly used to exclude plugins from the Docker image. @@ -668,6 +669,7 @@ "sigma", "sac", "cassandra", + "neo4j", ] if plugin for dependency in plugins[plugin] @@ -787,6 +789,7 @@ "sigma = datahub.ingestion.source.sigma.sigma:SigmaSource", "sac = datahub.ingestion.source.sac.sac:SACSource", "cassandra = datahub.ingestion.source.cassandra.cassandra:CassandraSource", + "neo4j = datahub.ingestion.source.neo4j.neo4j_source:Neo4jSource", ], "datahub.ingestion.transformer.plugins": [ "pattern_cleanup_ownership = datahub.ingestion.transformer.pattern_cleanup_ownership:PatternCleanUpOwnership", diff --git a/metadata-ingestion/tests/unit/test_neo4j_source.py b/metadata-ingestion/tests/unit/test_neo4j_source.py index 07f41a37aa36dd..3e35417e93e074 100644 --- a/metadata-ingestion/tests/unit/test_neo4j_source.py +++ b/metadata-ingestion/tests/unit/test_neo4j_source.py @@ -1,155 +1,218 @@ import unittest +from pathlib import Path import pandas as pd +import pytest from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.neo4j.neo4j_source import Neo4jConfig, Neo4jSource -class TestNeo4j(unittest.TestCase): - def setUp(self): - self.neo = Neo4jSource(Neo4jConfig(), PipelineContext(run_id="test")) - self.record_1 = { - "count": 1, - "labels": [], - "properties": { - "id": { - "unique": True, - "indexed": True, - "type": "STRING", - "existence": False, - }, - }, - "type": "node", - "relationships": { - "RELATIONSHIP_1": { - "count": 0, - "direction": "out", - "labels": ["Label_1"], - "properties": {}, - } - }, - } - self.record_2 = { - "count": 2, - "labels": [], - "properties": { - "id": { - "unique": True, - "indexed": True, - "type": "STRING", - "existence": False, - }, - "amount": { - "unique": True, - "indexed": True, - "type": "INTEGER", - "existence": False, - }, - }, - "type": "node", - "relationships": { - "RELATIONSHIP_1": { - "count": 0, - "direction": "out", - "labels": ["Label_1"], - "properties": {}, +@pytest.fixture +def tracking_uri(tmp_path: Path) -> str: + # return str(tmp_path / "neo4j") + return "neo4j+ssc://host:7687" + + +@pytest.fixture +def source(tracking_uri: str) -> Neo4jConfig: + return Neo4jSource( + ctx=PipelineContext(run_id="neo4j-test"), + config=Neo4jConfig(uri=tracking_uri), + ) + + +def data(): + return [ + { + "key": "Node_1", + "value": { + "count": 433026, + "relationships": { + "RELATIONSHIP_1": { + "count": 1, + "properties": { + "Relationship1_Property1": { + "existence": False, + "type": "STRING", + "indexed": False, + "array": False, + } + }, + "direction": "in", + "labels": ["Node_2"], + } }, "RELATIONSHIP_2": { - "count": 1, + "count": 2, + "properties": { + "Relationship2_Property1": { + "existence": False, + "type": "STRING", + "indexed": False, + "array": False, + } 
+ }, "direction": "in", - "labels": ["Label_1", "Label_2"], - "properties": {}, + "labels": ["Node_3"], }, + "type": "node", + "properties": { + "Node1_Property1": { + "existence": False, + "type": "DATE", + "indexed": False, + "unique": False, + }, + "Node1_Property2": { + "existence": False, + "type": "STRING", + "indexed": False, + "unique": False, + }, + "Node1_Property3": { + "existence": False, + "type": "STRING", + "indexed": False, + "unique": False, + }, + }, + "labels": [], }, - } - self.record_3 = {"count": 3, "properties": {}, "type": "relationship"} - self.record_4 = { - "RELATIONSHIP_2": { - "count": 4, - "properties": {}, - "type": "relationship", + }, + { + "key": "Node_2", + "value": { + "count": 3, "relationships": { "RELATIONSHIP_1": { - "count": 0, + "count": 1, + "properties": { + "Relationship1_Property1": { + "existence": False, + "type": "STRING", + "indexed": False, + "array": False, + } + }, "direction": "out", - "labels": ["Label_1"], - "properties": {}, + "labels": ["Node_2"], + } + }, + "type": "node", + "properties": { + "Node2_Property1": { + "existence": False, + "type": "DATE", + "indexed": False, + "unique": False, }, - "RELATIONSHIP_2": { - "count": 1, - "direction": "in", - "labels": ["Label_1", "Label_2"], - "properties": {}, + "Node2_Property2": { + "existence": False, + "type": "STRING", + "indexed": False, + "unique": False, + }, + "Node2_Property3": { + "existence": False, + "type": "STRING", + "indexed": False, + "unique": False, }, }, - } - } - - def create_df(self): - data = { - "key": ["item1", "item2", "item3", "RELATIONSHIP_2"], - "value": [ - self.record_1, - self.record_2, - self.record_3, - self.record_4, - ], - } - df = pd.DataFrame(data) - return df - - def test_get_obj_type(self): - assert self.neo.get_obj_type(self.record_1) == "node" - assert self.neo.get_obj_type(self.record_2) == "node" - assert self.neo.get_obj_type(self.record_3) == "relationship" - - def test_get_relationships(self): - assert self.neo.get_relationships(self.record_1, self.create_df()) == { - "RELATIONSHIP_1": { - "count": 0, - "direction": "out", - "labels": ["Label_1"], - "properties": {}, - } - } - assert self.neo.get_relationships(self.record_2, self.create_df()) == { - "RELATIONSHIP_1": { - "count": 0, - "direction": "out", - "labels": ["Label_1"], - "properties": {}, - }, - "RELATIONSHIP_2": { - "count": 1, - "direction": "in", - "labels": ["Label_1", "Label_2"], - "properties": {}, + "labels": [], }, - } - assert self.neo.get_relationships(self.record_3, self.create_df()) is None - - def test_get_property_data_types(self): - record_1 = self.record_1.get("properties", None) - record_2 = self.record_2.get("properties", None) - assert self.neo.get_property_data_types(record_1) == [{"id": "STRING"}] - assert self.neo.get_property_data_types(record_2) == [ - {"id": "STRING"}, - {"amount": "INTEGER"}, - ] - - def test_get_properties(self): - assert self.neo.get_properties(self.record_1) == { - "id": { - "unique": True, - "indexed": True, - "type": "STRING", - "existence": False, + }, + { + "key": "RELATIONSHIP_1", + "value": { + "count": 4, + "type": "relationship", + "properties": { + "Relationship1_Property1": { + "existence": False, + "type": "STRING", + "indexed": False, + "array": False, + } + }, }, - } - assert self.neo.get_properties(self.record_2) == self.record_2.get( - "properties", None - ) + }, + ] + + +def test_process_nodes(source): + df = source.process_nodes(data=data()) + assert type(df) is pd.DataFrame + + +def test_process_relationships(source): 
+ df = source.process_relationships( + data=data(), node_df=source.process_nodes(data=data()) + ) + assert type(df) is pd.DataFrame + + +def test_get_obj_type(source): + results = data() + assert source.get_obj_type(results[0]["value"]) == "node" + assert source.get_obj_type(results[1]["value"]) == "node" + assert source.get_obj_type(results[2]["value"]) == "relationship" + + +def test_get_node_description(source): + results = data() + df = source.process_nodes(data=data()) + assert ( + source.get_node_description(results[0], df) + == "(Node_1)<-[RELATIONSHIP_1]-(Node_2)" + ) + assert ( + source.get_node_description(results[1], df) + == "(Node_2)-[RELATIONSHIP_1]->(Node_2)" + ) + + +def test_get_property_data_types(source): + results = data() + assert source.get_property_data_types(results[0]["value"]["properties"]) == [ + {"Node1_Property1": "DATE"}, + {"Node1_Property2": "STRING"}, + {"Node1_Property3": "STRING"}, + ] + assert source.get_property_data_types(results[1]["value"]["properties"]) == [ + {"Node2_Property1": "DATE"}, + {"Node2_Property2": "STRING"}, + {"Node2_Property3": "STRING"}, + ] + assert source.get_property_data_types(results[2]["value"]["properties"]) == [ + {"Relationship1_Property1": "STRING"} + ] + + +def test_get_properties(source): + results = data() + assert list(source.get_properties(results[0]["value"]).keys()) == [ + "Node1_Property1", + "Node1_Property2", + "Node1_Property3", + ] + assert list(source.get_properties(results[1]["value"]).keys()) == [ + "Node2_Property1", + "Node2_Property2", + "Node2_Property3", + ] + assert list(source.get_properties(results[2]["value"]).keys()) == [ + "Relationship1_Property1" + ] + + +def test_get_relationships(source): + results = data() + record = list( + results[0]["value"]["relationships"].keys() + ) # Get the first key from the dict_keys + assert record == ["RELATIONSHIP_1"] if __name__ == "__main__": From d18e329e0c0ab06a5370cb7c7f92963f09fca29d Mon Sep 17 00:00:00 2001 From: kbartlett Date: Mon, 25 Nov 2024 17:51:35 -0500 Subject: [PATCH 19/31] feat: connector for Neo4j --- .../ingestion/source/neo4j/neo4j_source.py | 19 ++++++++++--------- .../bootstrap_mcps/data-platforms.yaml | 2 +- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py b/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py index 5f52c3c438e677..cb5e0705e45759 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py @@ -52,10 +52,10 @@ class Neo4jConfig(EnvConfigMixin): - username: str = Field(default=None, description="Neo4j Username") - password: str = Field(default=None, description="Neo4j Password") - uri: str = Field(default=None, description="The URI for the Neo4j server") - env: str = Field(default=None, description="Neo4j env") + username: str = Field(description="Neo4j Username") + password: str = Field(description="Neo4j Password") + uri: str = Field(description="The URI for the Neo4j server") + env: str = Field(description="Neo4j env") platform: str = Field(default="neo4j", description="Neo4j platform") @@ -84,7 +84,7 @@ def get_field_type(self, attribute_type: Union[type, str]) -> SchemaFieldDataTyp return SchemaFieldDataType(type=type_class()) def get_schema_field_class( - self, col_name: str, col_type: str, **kwargs + self, col_name: str, col_type: str, **kwargs: dict[str, str] ) -> SchemaFieldClass: if kwargs["obj_type"] == "node" and 
col_type == "relationship": col_type = "node" @@ -103,7 +103,7 @@ def get_schema_field_class( ) def add_properties( - self, dataset: str, description=None, custom_properties=None + self, dataset: str, description: Optional[str] = None, custom_properties: Optional[str]=None ) -> MetadataChangeProposalWrapper: dataset_properties = DatasetPropertiesClass( description=description, @@ -117,7 +117,7 @@ def add_properties( ) def generate_neo4j_object( - self, platform: str, dataset: str, columns: list, obj_type=None + self, platform: str, dataset: str, columns: list, obj_type: Optional[str]=None ) -> MetadataChangeProposalWrapper: try: fields = [ @@ -184,12 +184,13 @@ def get_neo4j_metadata(self, query: str) -> pd.DataFrame: df = pd.concat([node_df[union_cols], rel_df[union_cols]]) except Exception as e: self.report.failure( + message="Failed to get neo4j metadata", exc=e, ) return df - def process_nodes(self, data): + def process_nodes(self, data: list): nodes = [record for record in data if record["value"]["type"] == "node"] node_df = pd.DataFrame( nodes, @@ -212,7 +213,7 @@ def process_nodes(self, data): ) return node_df - def process_relationships(self, data, node_df): + def process_relationships(self, data: list, node_df: pd.DataFrame): rels = [record for record in data if record["value"]["type"] == "relationship"] rel_df = pd.DataFrame(rels, columns=["key", "value"]) rel_df["obj_type"] = rel_df["value"].apply( diff --git a/metadata-service/configuration/src/main/resources/bootstrap_mcps/data-platforms.yaml b/metadata-service/configuration/src/main/resources/bootstrap_mcps/data-platforms.yaml index 5865322de7442e..0b3d815c710980 100644 --- a/metadata-service/configuration/src/main/resources/bootstrap_mcps/data-platforms.yaml +++ b/metadata-service/configuration/src/main/resources/bootstrap_mcps/data-platforms.yaml @@ -735,6 +735,6 @@ datasetNameDelimiter: "." 
name: neo4j displayName: Neo4j - type: OTHER + type: OTHERS logoUrl: "/assets/platforms/neo4j.png" From 92d9d6557fadf579093a73475056e5ead7c54e8f Mon Sep 17 00:00:00 2001 From: kbartlett Date: Mon, 25 Nov 2024 19:59:46 -0500 Subject: [PATCH 20/31] feat: connector for Neo4j --- .../src/datahub/ingestion/source/neo4j/neo4j_source.py | 7 +++++-- metadata-ingestion/tests/unit/test_neo4j_source.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py b/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py index cb5e0705e45759..995b094b71c926 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py @@ -103,7 +103,10 @@ def get_schema_field_class( ) def add_properties( - self, dataset: str, description: Optional[str] = None, custom_properties: Optional[str]=None + self, + dataset: str, + description: Optional[str] = None, + custom_properties: Optional[str] = None, ) -> MetadataChangeProposalWrapper: dataset_properties = DatasetPropertiesClass( description=description, @@ -117,7 +120,7 @@ def add_properties( ) def generate_neo4j_object( - self, platform: str, dataset: str, columns: list, obj_type: Optional[str]=None + self, platform: str, dataset: str, columns: list, obj_type: Optional[str] = None ) -> MetadataChangeProposalWrapper: try: fields = [ diff --git a/metadata-ingestion/tests/unit/test_neo4j_source.py b/metadata-ingestion/tests/unit/test_neo4j_source.py index 3e35417e93e074..1d43c042917965 100644 --- a/metadata-ingestion/tests/unit/test_neo4j_source.py +++ b/metadata-ingestion/tests/unit/test_neo4j_source.py @@ -18,7 +18,7 @@ def tracking_uri(tmp_path: Path) -> str: def source(tracking_uri: str) -> Neo4jConfig: return Neo4jSource( ctx=PipelineContext(run_id="neo4j-test"), - config=Neo4jConfig(uri=tracking_uri), + config=Neo4jConfig(uri=tracking_uri, env='Prod', username='test', password='test'), ) From 872a4ac1675a17adf4459ac8c7e4f022481b7f33 Mon Sep 17 00:00:00 2001 From: kbartlett Date: Mon, 25 Nov 2024 20:01:08 -0500 Subject: [PATCH 21/31] feat: connector for Neo4j --- metadata-ingestion/tests/unit/test_neo4j_source.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/tests/unit/test_neo4j_source.py b/metadata-ingestion/tests/unit/test_neo4j_source.py index 1d43c042917965..bd736fb30b1ea4 100644 --- a/metadata-ingestion/tests/unit/test_neo4j_source.py +++ b/metadata-ingestion/tests/unit/test_neo4j_source.py @@ -18,7 +18,9 @@ def tracking_uri(tmp_path: Path) -> str: def source(tracking_uri: str) -> Neo4jConfig: return Neo4jSource( ctx=PipelineContext(run_id="neo4j-test"), - config=Neo4jConfig(uri=tracking_uri, env='Prod', username='test', password='test'), + config=Neo4jConfig( + uri=tracking_uri, env="Prod", username="test", password="test" + ), ) From c4f6953eacece79c7d1e109d25d4a07eb67c4fd4 Mon Sep 17 00:00:00 2001 From: kbartlett Date: Mon, 25 Nov 2024 21:09:18 -0500 Subject: [PATCH 22/31] feat: connector for Neo4j --- .../src/datahub/ingestion/source/neo4j/neo4j_source.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py b/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py index 995b094b71c926..6f4d7c259bdc81 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py +++ 
b/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py @@ -1,7 +1,7 @@ import logging import time from dataclasses import dataclass -from typing import Dict, Iterable, Optional, Type, Union +from typing import Any, Dict, Iterable, Optional, Type, Union import pandas as pd from neo4j import GraphDatabase @@ -84,7 +84,7 @@ def get_field_type(self, attribute_type: Union[type, str]) -> SchemaFieldDataTyp return SchemaFieldDataType(type=type_class()) def get_schema_field_class( - self, col_name: str, col_type: str, **kwargs: dict[str, str] + self, col_name: str, col_type: str, **kwargs: Any ) -> SchemaFieldClass: if kwargs["obj_type"] == "node" and col_type == "relationship": col_type = "node" @@ -193,7 +193,7 @@ def get_neo4j_metadata(self, query: str) -> pd.DataFrame: return df - def process_nodes(self, data: list): + def process_nodes(self, data: list) -> pd.DataFrame: nodes = [record for record in data if record["value"]["type"] == "node"] node_df = pd.DataFrame( nodes, @@ -216,7 +216,7 @@ def process_nodes(self, data: list): ) return node_df - def process_relationships(self, data: list, node_df: pd.DataFrame): + def process_relationships(self, data: list, node_df: pd.DataFrame) -> pd.DataFrame: rels = [record for record in data if record["value"]["type"] == "relationship"] rel_df = pd.DataFrame(rels, columns=["key", "value"]) rel_df["obj_type"] = rel_df["value"].apply( From a818783c5893a83d1a5023922187af6e5b2edb71 Mon Sep 17 00:00:00 2001 From: kbartlett Date: Mon, 25 Nov 2024 22:05:46 -0500 Subject: [PATCH 23/31] feat: connector for Neo4j --- .../src/datahub/ingestion/source/neo4j/neo4j_source.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py b/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py index 6f4d7c259bdc81..c8a42c8073bfdb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py @@ -1,7 +1,7 @@ import logging import time from dataclasses import dataclass -from typing import Any, Dict, Iterable, Optional, Type, Union +from typing import Any, Dict, Iterable, List, Optional, Type, Union import pandas as pd from neo4j import GraphDatabase @@ -26,6 +26,7 @@ BooleanTypeClass, DatasetPropertiesClass, DateTypeClass, + NullTypeClass, NumberTypeClass, OtherSchemaClass, SchemaFieldClass, @@ -80,7 +81,7 @@ def create(cls, config_dict, ctx): return cls(ctx, config) def get_field_type(self, attribute_type: Union[type, str]) -> SchemaFieldDataType: - type_class: Optional[type] = _type_mapping.get(attribute_type) + type_class: Optional[type] = _type_mapping.get(attribute_type, NullTypeClass) return SchemaFieldDataType(type=type_class()) def get_schema_field_class( @@ -146,10 +147,10 @@ def generate_neo4j_object( ), ) self.report.obj_created += 1 - return mcp except Exception as e: log.error(e) self.report.obj_failures += 1 + return mcp def get_neo4j_metadata(self, query: str) -> pd.DataFrame: driver = GraphDatabase.driver( @@ -267,7 +268,7 @@ def get_node_description(self, record: dict, df: pd.DataFrame) -> str: return "\n".join(descriptions) - def get_property_data_types(self, record: dict) -> list[dict]: + def get_property_data_types(self, record: dict) -> List[dict]: return [{k: v["type"]} for k, v in record.items()] def get_properties(self, record: dict) -> str: From 4212e47197d03851a33df6bd2f0ebdbf3895fd98 Mon Sep 17 00:00:00 2001 From: kbartlett Date: Tue, 26 
Nov 2024 08:33:42 -0500 Subject: [PATCH 24/31] feat: connector for Neo4j --- .../ingestion/source/neo4j/neo4j_source.py | 17 ++++++++++------- .../tests/unit/test_neo4j_source.py | 2 +- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py b/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py index c8a42c8073bfdb..0b03245e5acb2e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py @@ -70,6 +70,9 @@ class Neo4jSourceReport(SourceReport): @config_class(Neo4jConfig) @support_status(SupportStatus.CERTIFIED) class Neo4jSource(Source): + NODE = "node" + RELATIONSHIP = "relationship" + def __init__(self, ctx: PipelineContext, config: Neo4jConfig): self.ctx = ctx self.config = config @@ -87,8 +90,8 @@ def get_field_type(self, attribute_type: Union[type, str]) -> SchemaFieldDataTyp def get_schema_field_class( self, col_name: str, col_type: str, **kwargs: Any ) -> SchemaFieldClass: - if kwargs["obj_type"] == "node" and col_type == "relationship": - col_type = "node" + if kwargs["obj_type"] == self.NODE and col_type == self.RELATIONSHIP: + col_type = self.NODE else: col_type = col_type return SchemaFieldClass( @@ -96,7 +99,7 @@ def get_schema_field_class( type=self.get_field_type(col_type), nativeDataType=col_type, description=col_type.upper() - if col_type in ("node", "relationship") + if col_type in (self.NODE, self.RELATIONSHIP) else col_type, lastModified=AuditStampClass( time=round(time.time() * 1000), actor="urn:li:corpuser:ingestion" @@ -195,7 +198,7 @@ def get_neo4j_metadata(self, query: str) -> pd.DataFrame: return df def process_nodes(self, data: list) -> pd.DataFrame: - nodes = [record for record in data if record["value"]["type"] == "node"] + nodes = [record for record in data if record["value"]["type"] == self.NODE] node_df = pd.DataFrame( nodes, columns=["key", "value"], @@ -218,7 +221,7 @@ def process_nodes(self, data: list) -> pd.DataFrame: return node_df def process_relationships(self, data: list, node_df: pd.DataFrame) -> pd.DataFrame: - rels = [record for record in data if record["value"]["type"] == "relationship"] + rels = [record for record in data if record["value"]["type"] == self.RELATIONSHIP] rel_df = pd.DataFrame(rels, columns=["key", "value"]) rel_df["obj_type"] = rel_df["value"].apply( lambda record: self.get_obj_type(record) @@ -277,7 +280,7 @@ def get_properties(self, record: dict) -> str: def get_relationships(self, record: dict) -> dict: return record.get("relationships", None) - def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: + def get_workunits_internal(self) -> MetadataWorkUnit: df = self.get_neo4j_metadata( "CALL apoc.meta.schema() YIELD value UNWIND keys(value) AS key RETURN key, value[key] AS value;" ) @@ -303,7 +306,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: aspect=SubTypesClass( typeNames=[ DatasetSubTypes.NEO4J_NODE - if row["obj_type"] == "node" + if row["obj_type"] == self.NODE else DatasetSubTypes.NEO4J_RELATIONSHIP ] ), diff --git a/metadata-ingestion/tests/unit/test_neo4j_source.py b/metadata-ingestion/tests/unit/test_neo4j_source.py index bd736fb30b1ea4..62586718e86067 100644 --- a/metadata-ingestion/tests/unit/test_neo4j_source.py +++ b/metadata-ingestion/tests/unit/test_neo4j_source.py @@ -15,7 +15,7 @@ def tracking_uri(tmp_path: Path) -> str: @pytest.fixture -def source(tracking_uri: str) -> Neo4jConfig: +def 
source(tracking_uri: str) -> Neo4jSource: return Neo4jSource( ctx=PipelineContext(run_id="neo4j-test"), config=Neo4jConfig( From b163e16789dcd90601a8571a0145667ecfccabd8 Mon Sep 17 00:00:00 2001 From: kbartlett Date: Tue, 26 Nov 2024 09:33:58 -0500 Subject: [PATCH 25/31] feat: connector for Neo4j --- .../src/datahub/ingestion/source/neo4j/neo4j_source.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py b/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py index 0b03245e5acb2e..2b99ada9183f1b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py @@ -1,7 +1,7 @@ import logging import time from dataclasses import dataclass -from typing import Any, Dict, Iterable, List, Optional, Type, Union +from typing import Any, Dict, List, Optional, Type, Union import pandas as pd from neo4j import GraphDatabase @@ -221,7 +221,9 @@ def process_nodes(self, data: list) -> pd.DataFrame: return node_df def process_relationships(self, data: list, node_df: pd.DataFrame) -> pd.DataFrame: - rels = [record for record in data if record["value"]["type"] == self.RELATIONSHIP] + rels = [ + record for record in data if record["value"]["type"] == self.RELATIONSHIP + ] rel_df = pd.DataFrame(rels, columns=["key", "value"]) rel_df["obj_type"] = rel_df["value"].apply( lambda record: self.get_obj_type(record) From 4641c9fb5bd6d37d67235dc47a242d05134e20fe Mon Sep 17 00:00:00 2001 From: kbartlett Date: Tue, 26 Nov 2024 10:43:43 -0500 Subject: [PATCH 26/31] feat: connector for Neo4j --- .../src/datahub/ingestion/source/neo4j/neo4j_source.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py b/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py index 2b99ada9183f1b..28119a2b884153 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py @@ -1,7 +1,7 @@ import logging import time from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Type, Union +from typing import Any, Dict, Iterable, List, Optional, Type, Union import pandas as pd from neo4j import GraphDatabase @@ -84,7 +84,7 @@ def create(cls, config_dict, ctx): return cls(ctx, config) def get_field_type(self, attribute_type: Union[type, str]) -> SchemaFieldDataType: - type_class: Optional[type] = _type_mapping.get(attribute_type, NullTypeClass) + type_class: type = _type_mapping.get(attribute_type, NullTypeClass) return SchemaFieldDataType(type=type_class()) def get_schema_field_class( @@ -282,7 +282,7 @@ def get_properties(self, record: dict) -> str: def get_relationships(self, record: dict) -> dict: return record.get("relationships", None) - def get_workunits_internal(self) -> MetadataWorkUnit: + def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: df = self.get_neo4j_metadata( "CALL apoc.meta.schema() YIELD value UNWIND keys(value) AS key RETURN key, value[key] AS value;" ) @@ -295,6 +295,8 @@ def get_workunits_internal(self) -> MetadataWorkUnit: dataset=row["key"], platform=self.config.platform, ), + treat_errors_as_warnings=False, + is_primary_source=True, ) yield MetadataWorkUnit( From 6aed7c7840f048dda44cc8fde8cb980036963afe Mon Sep 17 00:00:00 2001 From: kbartlett Date: Tue, 26 Nov 2024 11:23:29 -0500 Subject: 
[PATCH 27/31] feat: connector for Neo4j --- .../src/datahub/ingestion/source/neo4j/neo4j_source.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py b/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py index 28119a2b884153..673f5b40b7e573 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py @@ -110,7 +110,7 @@ def add_properties( self, dataset: str, description: Optional[str] = None, - custom_properties: Optional[str] = None, + custom_properties: Optional[Dict[str, str]] = None, ) -> MetadataChangeProposalWrapper: dataset_properties = DatasetPropertiesClass( description=description, @@ -295,7 +295,6 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: dataset=row["key"], platform=self.config.platform, ), - treat_errors_as_warnings=False, is_primary_source=True, ) From f71e7be796e5a201b8d61c55bb1ce9d39b68dd8f Mon Sep 17 00:00:00 2001 From: kbartlett Date: Tue, 26 Nov 2024 12:29:59 -0500 Subject: [PATCH 28/31] feat: connector for Neo4j --- .../src/datahub/ingestion/source/neo4j/neo4j_source.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py b/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py index 673f5b40b7e573..9becb8857a55df 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py @@ -290,7 +290,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: try: yield MetadataWorkUnit( id=row["key"], - mcp_raw=self.generate_neo4j_object( + mcp=self.generate_neo4j_object( columns=row["property_data_types"], dataset=row["key"], platform=self.config.platform, From fc0d90f61e2fb5fb73a8a671aad4f9d5f22d6b5b Mon Sep 17 00:00:00 2001 From: kbartlett Date: Tue, 26 Nov 2024 14:19:19 -0500 Subject: [PATCH 29/31] feat: connector for Neo4j --- .../src/app/ingest/source/builder/constants.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datahub-web-react/src/app/ingest/source/builder/constants.ts b/datahub-web-react/src/app/ingest/source/builder/constants.ts index 872abf9d23dafa..cc2d3538c631fa 100644 --- a/datahub-web-react/src/app/ingest/source/builder/constants.ts +++ b/datahub-web-react/src/app/ingest/source/builder/constants.ts @@ -38,7 +38,7 @@ import sigmaLogo from '../../../../images/sigmalogo.png'; import sacLogo from '../../../../images/saclogo.svg'; import cassandraLogo from '../../../../images/cassandralogo.png'; import datahubLogo from '../../../../images/datahublogo.png'; -import neo4j from '../../../../images/neo4j.png' +import neo4j from '../../../../images/neo4j.png'; export const ATHENA = 'athena'; export const ATHENA_URN = `urn:li:dataPlatform:${ATHENA}`; @@ -138,8 +138,8 @@ export const DATAHUB_GC = 'datahub-gc'; export const DATAHUB_LINEAGE_FILE = 'datahub-lineage-file'; export const DATAHUB_BUSINESS_GLOSSARY = 'datahub-business-glossary'; export const DATAHUB_URN = `urn:li:dataPlatform:${DATAHUB}`; -export const NEO4J = "neo4j" -export const NEO4J_URN = `urn:li:dataPlatform:${NEO4J}` +export const NEO4J = "neo4j"; +export const NEO4J_URN = `urn:li:dataPlatform:${NEO4J}`; export const PLATFORM_URN_TO_LOGO = { [ATHENA_URN]: athenaLogo, From aee4ca59c86cd610d6983b5cf3801268b66040fa Mon Sep 17 00:00:00 2001 From: kbartlett 
Date: Tue, 26 Nov 2024 16:57:55 -0500 Subject: [PATCH 30/31] feat: connector for Neo4j --- datahub-web-react/src/app/ingest/source/builder/constants.ts | 4 ++-- datahub-web-react/src/app/ingest/source/builder/sources.json | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/datahub-web-react/src/app/ingest/source/builder/constants.ts b/datahub-web-react/src/app/ingest/source/builder/constants.ts index cc2d3538c631fa..58525b3e88f975 100644 --- a/datahub-web-react/src/app/ingest/source/builder/constants.ts +++ b/datahub-web-react/src/app/ingest/source/builder/constants.ts @@ -138,7 +138,7 @@ export const DATAHUB_GC = 'datahub-gc'; export const DATAHUB_LINEAGE_FILE = 'datahub-lineage-file'; export const DATAHUB_BUSINESS_GLOSSARY = 'datahub-business-glossary'; export const DATAHUB_URN = `urn:li:dataPlatform:${DATAHUB}`; -export const NEO4J = "neo4j"; +export const NEO4J = 'neo4j'; export const NEO4J_URN = `urn:li:dataPlatform:${NEO4J}`; export const PLATFORM_URN_TO_LOGO = { @@ -183,7 +183,7 @@ export const PLATFORM_URN_TO_LOGO = { [SAC_URN]: sacLogo, [CASSANDRA_URN]: cassandraLogo, [DATAHUB_URN]: datahubLogo, - [NEO4J_URN]: neo4j + [NEO4J_URN]: neo4j, }; export const SOURCE_TO_PLATFORM_URN = { diff --git a/datahub-web-react/src/app/ingest/source/builder/sources.json b/datahub-web-react/src/app/ingest/source/builder/sources.json index a5eec5f3af8437..4ec2d4300aff60 100644 --- a/datahub-web-react/src/app/ingest/source/builder/sources.json +++ b/datahub-web-react/src/app/ingest/source/builder/sources.json @@ -324,6 +324,6 @@ "displayName": "Neo4j", "description": "Import Nodes and Relationships from Neo4j.", "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/neo4j/", - "recipe": "source:\n type: 'neo4j'\n config:\n uri: 'neo4j+ssc://host:7687'\n username: 'neo4j'\n password: 'password'\n env: 'PROD'\n\nsink:\n type: \"datahub-rest\"\n config:\n server: 'http://localhost:8080'" + "recipe": "source:\n type: 'neo4j'\n config:\n uri: 'neo4j+ssc://host:7687'\n username: 'neo4j'\n password: 'password'\n env: 'PROD'\n\nsink:\n type: \"datahub-rest\"\n config:\n server: 'http://localhost:8080'" } ] From 01dfc269973da6f03e6e62775fc36a8bd1324539 Mon Sep 17 00:00:00 2001 From: kbartlett Date: Fri, 29 Nov 2024 09:29:46 -0500 Subject: [PATCH 31/31] feat: connector for Neo4j --- .../datahub/ingestion/source/neo4j/neo4j_source.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py b/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py index 9becb8857a55df..2c9107b967e4f8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/neo4j/neo4j_source.py @@ -57,7 +57,6 @@ class Neo4jConfig(EnvConfigMixin): password: str = Field(description="Neo4j Password") uri: str = Field(description="The URI for the Neo4j server") env: str = Field(description="Neo4j env") - platform: str = Field(default="neo4j", description="Neo4j platform") @dataclass @@ -72,6 +71,7 @@ class Neo4jSourceReport(SourceReport): class Neo4jSource(Source): NODE = "node" RELATIONSHIP = "relationship" + PLATFORM = "neo4j" def __init__(self, ctx: PipelineContext, config: Neo4jConfig): self.ctx = ctx @@ -118,13 +118,13 @@ def add_properties( ) return MetadataChangeProposalWrapper( entityUrn=make_dataset_urn( - platform=self.config.platform, name=dataset, env=self.config.env + platform=self.PLATFORM, name=dataset, 
env=self.config.env ), aspect=dataset_properties, ) def generate_neo4j_object( - self, platform: str, dataset: str, columns: list, obj_type: Optional[str] = None + self, dataset: str, columns: list, obj_type: Optional[str] = None ) -> MetadataChangeProposalWrapper: try: fields = [ @@ -134,11 +134,11 @@ def generate_neo4j_object( ] mcp = MetadataChangeProposalWrapper( entityUrn=make_dataset_urn( - platform=platform, name=dataset, env=self.config.env + platform=self.PLATFORM, name=dataset, env=self.config.env ), aspect=SchemaMetadataClass( schemaName=dataset, - platform=make_data_platform_urn(platform), + platform=make_data_platform_urn(self.PLATFORM), version=0, hash="", platformSchema=OtherSchemaClass(rawSchema=""), @@ -293,7 +293,6 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: mcp=self.generate_neo4j_object( columns=row["property_data_types"], dataset=row["key"], - platform=self.config.platform, ), is_primary_source=True, ) @@ -302,7 +301,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: id=row["key"], mcp=MetadataChangeProposalWrapper( entityUrn=make_dataset_urn( - platform=self.config.platform, + platform=self.PLATFORM, name=row["key"], env=self.config.env, ),