From a808a7b2af70d3aab27845caa81601ecee922c6a Mon Sep 17 00:00:00 2001
From: wapiti08
Date: Fri, 27 Sep 2019 15:52:20 +0100
Subject: [PATCH 1/8] Add files via upload

---
 data/Linux/anomaly_lables.csv | Bin 0 -> 10417 bytes
 data/Linux/log_matrix.npy     | Bin 0 -> 318256 bytes
 data/Linux/mal_matrix.npy     | Bin 0 -> 695696 bytes
 3 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 data/Linux/anomaly_lables.csv
 create mode 100644 data/Linux/log_matrix.npy
 create mode 100644 data/Linux/mal_matrix.npy

diff --git a/data/Linux/anomaly_lables.csv b/data/Linux/anomaly_lables.csv
new file mode 100644
index 0000000000000000000000000000000000000000..c2b8c5747e41824a22f606845524edf4d86640c0
GIT binary patch
literal 10417
[binary patch data omitted]

diff --git a/data/Linux/log_matrix.npy b/data/Linux/log_matrix.npy
new file mode 100644
index 0000000000000000000000000000000000000000..5e371a5dd30336c7b24a97ce08e387b365f54ee9
GIT binary patch
literal 318256
[binary patch data omitted]

diff --git a/data/Linux/mal_matrix.npy b/data/Linux/mal_matrix.npy
new file mode 100644
index 0000000000000000000000000000000000000000..fbcc8f287ef998454b4b8b8df41c795eb43bc1a0
GIT binary patch
literal 695696
[binary patch data omitted]
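The three binary artifacts above are the precomputed inputs consumed later in the series (load_Linux() in PATCH 2 reads the .npy matrices with np.load). A minimal sanity check, assuming the files sit under data/Linux/ in a checkout; the pandas call and the printed shapes are illustrative assumptions, not part of the patch:

import numpy as np
import pandas as pd

log_matrix = np.load('data/Linux/log_matrix.npy')      # rows: sliding windows, columns: event counts
mal_matrix = np.load('data/Linux/mal_matrix.npy')
labels = pd.read_csv('data/Linux/anomaly_lables.csv')  # filename keeps the repository's spelling

print(log_matrix.shape, mal_matrix.shape, labels.shape)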
From 1abcae4679cd4a09e007599bf6b7fab637ab864f Mon Sep 17 00:00:00 2001
From: wapiti08
Date: Fri, 27 Sep 2019 15:52:59 +0100
Subject: [PATCH 2/8] Add files via upload

---
 loglizer/dataloader.py | 212 +++++++++++++++++++++++++++++++++++++++--
 loglizer/matrixgen.py  | 208 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 410 insertions(+), 10 deletions(-)
 create mode 100644 loglizer/matrixgen.py

diff --git a/loglizer/dataloader.py b/loglizer/dataloader.py
index 574617e..3f203e2 100644
--- a/loglizer/dataloader.py
+++ b/loglizer/dataloader.py
@@ -38,8 +38,9 @@ def _split_data(x_data, y_data=None, train_ratio=0, split_type='uniform'):
     else:
         y_train = y_data[0:num_train]
         y_test = y_data[num_train:]
-    # Random shuffle
-    indexes = shuffle(np.arange(x_train.shape[0]))
+
+    # fixed shuffle ---- use a fixed random_state so repeated runs give the same result
+    indexes = shuffle(np.arange(x_train.shape[0]), random_state=7)
     x_train = x_train[indexes]
     if y_train is not None:
         y_train = y_train[indexes]
@@ -140,6 +141,7 @@ def load_BGL(log_file, label_file=None, window='sliding', time_interval=60, step
     """
 
 
+
 def bgl_preprocess_data(para, raw_data, event_mapping_data):
     """ split logs into sliding windows, built an event count matrix and get the corresponding label
 
     Args:
     --------
     para: the parameters dictionary
     raw_data: list of (label, time)
     event_mapping_data: a list of event index, where each row index indicates a corresponding log
 
     Returns:
     --------
     event_count_matrix: event count matrix, where each row is an instance (log sequence vector)
     labels: a list of labels, 1 represents anomaly
     """
 
     # create the directory for saving the sliding windows (start_index, end_index), which can be directly loaded in future running
     if not os.path.exists(para['save_path']):
         os.mkdir(para['save_path'])
     log_size = raw_data.shape[0]
-    sliding_file_path = para['save_path']+'sliding_'+str(para['window_size'])+'h_'+str(para['step_size'])+'h.csv'
+    sliding_file_path = para['save_path']+'_sliding_'+str(para['window_size'])+'h_'+str(para['step_size'])+'h.csv'
 
     #=============divide into sliding windows=========#
     start_end_index_list = [] # list of tuples, tuple contains two number, which represent the start and end of sliding time window
-    label_data, time_data = raw_data[:,0], raw_data[:, 1]
+    # get the list of label data and the list of time data
+    label_data, time_data = raw_data[:,0], raw_data[:,1]
     if not os.path.exists(sliding_file_path):
         # split into sliding window
+        # get the first value in the time_data list
         start_time = time_data[0]
+        print("the start_time is:",start_time)
+        print("the type of time is:",type(start_time))
+        # start_index and end_index point into the time_data list
         start_index = 0
         end_index = 0
         # get the first start, end index, end time
         for cur_time in time_data:
+            # start_time + para['window_size']*3600 is the upper bound of the first window:
+            ## start_time is the first value in the time_data list
+            ## the window size (in hours) gives the width of the window
+            ## cur_time < that bound means this log line falls inside the window
+            print("the current time is:",cur_time)
-            if cur_time < start_time + para['window_size']*3600:
+            # if cur_time < start_time + para['window_size']*3600:
+            if int(cur_time) < int(start_time) + para['window_size'] * 3600:
                 end_index += 1
                 end_time = cur_time
             else:
                 break
         # move the start and end index until next sliding window
         while end_index < log_size:
-            start_time = start_time + para['step_size']*3600
-            end_time = end_time + para['step_size']*3600
+            # start_time = start_time + para['step_size']*3600
+            # end_time = end_time + para['step_size']*3600
+            start_time = int(start_time) + para['step_size']*3600
+            end_time = int(end_time) + para['step_size']*3600
             for i in range(start_index,end_index):
-                if time_data[i] < start_time:
+                # if time_data[i] < start_time:
+                if int(time_data[i]) < start_time:
                     i+=1
                 else:
                     break
             for j in range(end_index, log_size):
-                if time_data[j] < end_time:
+                # if time_data[j] < end_time:
+                if int(time_data[j]) < end_time:
                     j+=1
                 else:
                     break
             start_index = i
             end_index = j
             start_end_pair = tuple((start_index, end_index))
             start_end_index_list.append(start_end_pair)
         inst_number = len(start_end_index_list)
         print('there are %d instances (sliding windows) in this dataset\n'%inst_number)
-        np.savetxt(sliding_file_path,start_end_index_list,delimiter=',',fmt='%d')
+        np.savetxt(sliding_file_path, start_end_index_list, delimiter=',', fmt='%d')
     else:
         print('Loading start_end_index_list from file')
         start_end_index_list = pd.read_csv(sliding_file_path, header=None).values
         inst_number = len(start_end_index_list)
 
     # get all the log indexes in each time window by ranging from start_index to end_index
     expanded_indexes_list = []
     for t in range(inst_number):
         index_list = []
         expanded_indexes_list.append(index_list)
     for i in range(inst_number):
         start_index = start_end_index_list[i][0]
         end_index = start_end_index_list[i][1]
         for l in range(start_index, end_index):
             expanded_indexes_list[i].append(l)
 
     event_mapping_data = [row[0] for row in event_mapping_data]
+    print("the event_mapping_data is:", event_mapping_data)
     event_num = len(list(set(event_mapping_data)))
     print('There are %d log events'%event_num)
 
     #=============get labels and event count of each sliding window =========#
     labels = []
+    # inst_number --- rows: every row is a log sequence (one sliding window)
+    # event_num --- columns: every column is an event; the value is the occurrence count of that event
     event_count_matrix = np.zeros((inst_number,event_num))
     for j in range(inst_number):
         label = 0   #0 represent success, 1 represent failure
         for k in expanded_indexes_list[j]:
+            print("the length of expanded_indexes_list is:",len(expanded_indexes_list[j]))
+            print("the k value is:",k)
             event_index = event_mapping_data[k]
+            print("the event_index is:", event_index)
+            # shift to a zero-based index, since EventId starts from 1
+            event_index = event_index-1
             event_count_matrix[j, event_index] += 1
             if label_data[k]:
                 label = 1
@@ -237,3 +262,170 @@ def bgl_preprocess_data(para, raw_data, event_mapping_data):
     print("Among all instances, %d are anomalies"%sum(labels))
     assert event_count_matrix.shape[0] == len(labels)
     return event_count_matrix, labels
+
+
+
+# this is a part of test for the bgl_preprocess_data function
+# import os
+# import pandas as pd
+# import numpy as np
+# from collections import Counter
+#
+# para = {}
+# para['save_path'] = '../../logparser-master/logs/BGL/BGL_2k.log_matrix'
+# para['window_size'] = 24   # 24 hours ---- one day
+# para['step_size'] = 3   # 3 hours
+#
+# # list data, the element is a tuple of (label, time)
+#
+# # System log Detection/Anomaly_Detection_Time.ipynb
+# df_raw_data = pd.read_csv('../../logparser-master/logs/BGL/BGL_2k.log_structured.csv')
+# raw_data = []
+# for label, time in zip(df_raw_data['Label'],df_raw_data['Timestamp']):
+#     raw_data.append((label, time))
+# # raw_data
+# raw_data = np.array(raw_data)
+#
+# df_map_event = pd.read_csv('../../logparser-master/logs/BGL/BGL_2k.log_structured.csv')
+# event_mapping_data = []
+# ids = []
+# ids = [int(x[1:]) for x in df_map_event['EventId']]
+#
+# for id, log in zip(ids, df_map_event['EventTemplate']):
+#     event_mapping_data.append([id,log])
+#
+#
+# event_count_matrix, labels = bgl_preprocess_data(para, raw_data, event_mapping_data)
+# print("the event_count_matrix is:", Counter(event_count_matrix[9]))
+# print("the labels are:", Counter(labels))
+
+
+def load_Linux(log_file, label_file=None, window='sliding', time_interval=None, stepping_size=None, train_ratio=0.5, split_type='sequential', save_csv=False):
+
+    print('========== Input data summary ==========')
+    if log_file.endswith('.npy'):
+        # split training and validation set in a class-uniform way
+        assert window == 'sliding', 'Only window=sliding is supported for Linux dataset'
+
+        data_df = np.load(log_file)
+        if label_file is None:
+            if split_type == 'uniform':
+                print('Warning: Only split_type=sequential is supported')
+                split_type = 'sequential'
+            # split training and validation set sequentially
+            x_data = data_df
+            (x_train,_),(x_test,_) = _split_data(x_data, train_ratio=train_ratio, split_type=split_type)
+            print('Total: {} instances, train: {} instances, test: {} instances'.format(x_data.shape[0], x_train.shape[0], x_test.shape[0]))
+
+            return (x_train, None), (x_test, None)
+    else:
+        raise NotImplementedError('load_Linux() only supports npy files')
+
+# this is a part of test for the Linux_preprocess_data function --- get the event matrix
+
+
+def Linux_preprocess_data(para, raw_data, event_mapping_data):
+    """
+    split logs into sliding windows and build an event count matrix (no labels here)
+
+    Args:
+    --------
+    para: the parameters dictionary
+    raw_data: list of times --- transformed to seconds (absolute offsets from the first entry)
+    event_mapping_data: a list of event index, where each row index indicates a corresponding log
+
+    Returns:
+    --------
+    event_count_matrix: event count matrix, where each row is an instance (log sequence vector)
+    """
+
+    # create the directory for saving the sliding windows (start_index, end_index), which can be directly loaded in future running
+    if not os.path.exists(para['save_path']):
+        os.mkdir(para['save_path'])
+    log_size = raw_data.shape[0]
+    sliding_file_path = para['save_path']+'_sliding_'+str(para['window_size'])+'h_'+str(para['step_size'])+'h.csv'
+    print("the sliding_file_path is:", sliding_file_path)
+
+    # ============= divide into sliding windows ============
+
+    start_end_index_list = []   # list of (start_index, end_index) tuples marking each sliding time window
+    # get the list of time data
+    time_data = raw_data
+
+    if not os.path.exists(sliding_file_path):
+        start_time = time_data[0]
+        start_index = 0
+        end_index = 0
+        # find the end of the first window in one pass using window_size
+        for cur_time in time_data:
+            if cur_time < start_time + para['window_size'] * 3600:
+                end_index += 1
+                end_time = cur_time
+            else:
+                start_end_pair = tuple((start_index, end_index))
+                start_end_index_list.append(start_end_pair)
+                break
+        # slide the block and change the index of start and end
+        while end_index < log_size:
+            # add the sliding size to start time
+            start_time = start_time + para['step_size']*3600
+            end_time = end_time + para['step_size']*3600
+            for i in range(start_index, end_index):
+                if time_data[i] < start_time:
+                    i += 1
+                else:
+                    break
+            for j in range(end_index, log_size):
+                if time_data[j] < end_time:
+                    j += 1
+                else:
+                    break
+            start_index = i
+            end_index = j
+            # update the start_end_pair
+            start_end_pair = tuple((start_index, end_index))
+            start_end_index_list.append(start_end_pair)
+        # compute how many sequences (lines) in total
+        inst_number = len(start_end_index_list)
+        print("there are %d instances (sliding windows) in this dataset"%(inst_number))
+        np.savetxt(sliding_file_path, start_end_index_list, delimiter=',', fmt='%d')
+    else:
+        print("Loading start_end_index_list from file")
+        start_end_index_list = pd.read_csv(sliding_file_path, header=None).values
+        inst_number = len(start_end_index_list)
+        print("there are %d instances (sliding windows) in this dataset"%(inst_number))
+
+    # get all the log indexes in each time window by ranging from start_index to end_index
+    # in order to count event occurrences
+    expanded_indexes_list = []
+    for t in range(inst_number):
+        # for every row (sequence), there should be an index_list
+        index_list = []
+        expanded_indexes_list.append(index_list)
+    for i in range(inst_number):
+        # get the index_list for every row
+        start_index = start_end_index_list[i][0]
+        end_index = start_end_index_list[i][1]
+        # add the indexes for a sequence to expanded_indexes_list
+        for l in range(start_index, end_index):
+            expanded_indexes_list[i].append(l)
+
+    event_mapping_data = [row[0] for row in event_mapping_data]
+    # get the total number of events
+    event_num = len(list(set(event_mapping_data)))
+    print("the event number is:", event_num)
+
+    # ============ get event count of each sliding window =============
+    event_count_matrix = np.zeros((inst_number, event_num))
+    for j in range(inst_number):
+        for k in expanded_indexes_list[j]:
+            event_index = event_mapping_data[k]
+            # make the eventId suitable as a zero-based list index
+            event_index = event_index - 1
+            event_count_matrix[j, event_index] += 1
+
+    return event_count_matrix
+
+
+
+
+
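The comments above describe the same sliding-window bookkeeping twice (BGL and Linux variants), so it is worth checking on a toy input. The following is a cleaned-up re-statement of the index-advancing loops, not the patch's code verbatim (note that the in-place `i += 1` inside `for i in range(...)` has no effect on the iteration, so `while` loops make the intent explicit). Timestamps are in seconds and window/step sizes in hours, as in the patch:

# Re-statement of the (start_index, end_index) sliding-window computation used above.
def sliding_windows(time_data, window_size, step_size):
    windows = []
    start_index, end_index = 0, 0
    start_time = time_data[0]
    end_time = start_time
    # find the end of the first window
    while end_index < len(time_data) and time_data[end_index] < start_time + window_size * 3600:
        end_time = time_data[end_index]
        end_index += 1
    windows.append((start_index, end_index))
    # slide: advance both bounds by step_size and move the indexes forward
    while end_index < len(time_data):
        start_time += step_size * 3600
        end_time += step_size * 3600
        while start_index < end_index and time_data[start_index] < start_time:
            start_index += 1
        while end_index < len(time_data) and time_data[end_index] < end_time:
            end_index += 1
        windows.append((start_index, end_index))
    return windows

# logs at 0h..9h, 3h windows sliding by 2h
print(sliding_windows([h * 3600 for h in range(10)], window_size=3, step_size=2))
# -> [(0, 3), (2, 4), (4, 6), (6, 8), (8, 10)]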
diff --git a/loglizer/matrixgen.py b/loglizer/matrixgen.py
new file mode 100644
index 0000000..a572a62
--- /dev/null
+++ b/loglizer/matrixgen.py
@@ -0,0 +1,208 @@
+import os
+import pandas as pd
+import numpy as np
+from collections import Counter
+import re
+from dataloader import *
+import joblib
+
+# function to transform hours and minutes to seconds (note: shadowed by the month-aware trans_seconds defined below)
+def trans_seconds(time_list):
+    seconds_list = []
+    seconds = 0
+    for i in range(len(time_list)):
+        # print("splitting time:",time_list[i])
+        seconds = int(time_list[i][0]) * 3600 + int(time_list[i][1]) * 60 + int(time_list[i][2])
+        seconds_list.append(seconds)
+    return seconds_list
+
+# map month names to numbers
+def month_string_to_number(string):
+    m = {
+        'Jan': 1,
+        'Feb': 2,
+        'Mar': 3,
+        'Apr': 4,
+        'May': 5,
+        'Jun': 6,
+        'Jul': 7,
+        'Aug': 8,
+        'Sep': 9,
+        'Oct': 10,
+        'Nov': 11,
+        'Dec': 12
+        }
+    s = string.strip()[:3]
+
+    try:
+        out = m[s]
+        return out
+    except:
+        # process the special case where the month is wrapped like <...>Jun
+        pattern = '<.*>(.*)'
+        match = re.match(pattern,string)
+        s = match.group(1)
+        out = m[s]
+        return out
+        # raise ValueError('Not a month')
+
+# transform month, day and time to seconds (this definition overrides the simpler one above)
+def trans_seconds(month_list, day_list, time_list):
+    seconds_list = []
+    seconds = 0
+    for i in range(len(day_list)):
+        # we assume there are 30 days in every month
+        seconds = (int(month_list[i]) - int(month_list[0])) * 30 * 24 * 3600 + (int(day_list[i]) - int(day_list[0])) * 24 * 3600 + \
+                  int(time_list[i][0]) * 3600 + int(time_list[i][1]) * 60 + int(time_list[i][2])
+        # print("the seconds are:", seconds)
+        seconds_list.append(seconds)
+    return seconds_list
+
+# transform log key to eventID
+# def Event_Convert(fd):
+#     event_map = {}
+#     for i, event in enumerate(fd['EventId']):
+#         event_map['E' + str(i+1)] = event
+#
+#     return event_map
+def Event_Convert(fd, filename):
+    event_map = {}
+    event_list = None
+    event_list = fd['EventId']
+    # get the unique values in a list
+    event_list = list(set(event_list))
+    for i, event in enumerate(event_list):
+        event_map[str(i+1)] = event
+    joblib.dump(event_map, filename)
+    return event_map
+
+
+if __name__ == "__main__":
+
+    # define the window_size and step_size to get the time sequence
+    para = {}
+    para['save_path'] = '../../Dataset_ML/Linux'
+    para['window_size'] = 24   # 24 hours ---- one day
+    para['step_size'] = 3   # 3 hours
+
+    # =============================== generate the event matrix for normal linux logs =========================
+    # get the linux dataframe
+    fd_linux = pd.read_csv('../../Dataset_ML/Linux_2k.log_structured.csv')
+    # make a copy to avoid modifying the original data
+    fd_linux = fd_linux.copy()
+
+    filename = '../../Dataset_ML/Linux_matrix/Event_dict.pkl'
+    # check whether the event dict already exists
+    if os.path.isfile(filename):
+        event_map = joblib.load(filename)
+    else:
+        event_map = Event_Convert(fd_linux, filename)
+
+    for i in range(len(fd_linux['EventId'])):
+        for key, value in event_map.items():
+            fd_linux.is_copy = False
+            if fd_linux['EventId'][i] == value:
+                fd_linux['EventId'][i] = key
+
+    fd_linux.to_csv('../../Dataset_ML/Linux_2k.log_structured_id.csv', index=0)
+
+    fd_linux_id = pd.read_csv('../../Dataset_ML/Linux_2k.log_structured_id.csv')
+    fd_linux_id = fd_linux_id.copy()
+
+    # part to transform the month, date, time into seconds
+    month_list, time_list, day_list = [], [], []
+
+    for i in range(len(fd_linux_id['Time'])):
+        time_list.append(fd_linux_id['Time'][i].split(':'))
+    for j in range(len(fd_linux_id['Date'])):
+        day_list.append(fd_linux_id['Date'][j])
+
+    month_number = 0
+    for k in range(len(fd_linux_id['Month'])):
+        # print("we are transferring the month:",fd_linux['Month'][k])
+        month_number = month_string_to_number(fd_linux_id['Month'][k])
+        month_list.append(month_number)
+
+    seconds_list = trans_seconds(month_list, day_list, time_list)
+
+    raw_data = np.array(seconds_list)
+
+    event_mapping_data = []
+    Event_ids = []
+    # get the digits part of the eventID
+    Event_ids = [int(x) for x in fd_linux_id['EventId']]
+
+    for id, log in zip(Event_ids, fd_linux_id['EventTemplate']):
+        event_mapping_data.append([id, log])
+
+
+    # create the event count matrix with the Linux_preprocess_data function
+    event_count_matrix = Linux_preprocess_data(para, raw_data, event_mapping_data)
+    # print("the event_count_matrix is:", Counter(event_count_matrix[9]))
+    print("the event_count_matrix is:", event_count_matrix)
+    matrix = '../../Dataset_ML/Linux_matrix/log_matrix.npy'
+    np.save(matrix, event_count_matrix)
+    # np.load(matrix+'.npy')
+
+
+    # =============================== generate the event matrix for malicious linux logs =========================
+
+    para_mal = {}
+    para_mal['save_path'] = '../../Dataset_ML/Linux_mal'
+    para_mal['window_size'] = 24   # 24 hours ---- one day
+    para_mal['step_size'] = 3   # 3 hours
+
+    fd_linux_mali = pd.read_csv('../../Dataset_ML/malicious_linux.log_structured.csv')
+    fd_linux_mali = fd_linux_mali.copy()
+
+    filename_mali = '../../Dataset_ML/Linux_mal_matrix/Event_mal_dict.pkl'
+    # check whether the event dict already exists
+    if os.path.isfile(filename_mali):
+        event_map_mal = joblib.load(filename_mali)
+    else:
+        event_map_mal = Event_Convert(fd_linux_mali, filename_mali)
+
+    for i in range(len(fd_linux_mali['EventId'])):
+        for key, value in event_map_mal.items():
+            fd_linux_mali.is_copy = False
+            if fd_linux_mali['EventId'][i] == value:
+                fd_linux_mali['EventId'][i] = key
+
+    fd_linux_mali.to_csv('../../Dataset_ML/malicious_linux.log_structured_id.csv', index=0)
+
+    fd_linux_mali_id = pd.read_csv('../../Dataset_ML/malicious_linux.log_structured_id.csv')
+    fd_linux_mali_id = fd_linux_mali_id.copy()
+
+    # part to transform date and time into seconds
+    month_list_mal, time_list_mal, day_list_mal = [], [], []
+
+    for i in range(len(fd_linux_mali_id['Time'])):
+        time_list_mal.append(fd_linux_mali_id['Time'][i].split(':'))
+    for j in range(len(fd_linux_mali_id['Date'])):
+        day_list_mal.append(fd_linux_mali_id['Date'][j])
+
+    month_number_mal = 0
+    for k in range(len(fd_linux_mali_id['Month'])):
+        # print("we are transferring the month:",fd_linux['Month'][k])
+        month_number_mal = month_string_to_number(fd_linux_mali_id['Month'][k])
+        month_list_mal.append(month_number_mal)
+
+    seconds_list_mal = trans_seconds(month_list_mal, day_list_mal, time_list_mal)
+
+    raw_data_mal = np.array(seconds_list_mal)
+
+    event_mapping_data_mal = []
+    Event_ids_mal = []
+    # get the digits part of the eventID
+    Event_ids_mal = [int(x) for x in fd_linux_mali_id['EventId']]
+
+    for id, log in zip(Event_ids_mal, fd_linux_mali_id['EventTemplate']):
+        event_mapping_data_mal.append([id, log])
+
+
+    event_count_matrix_mal = Linux_preprocess_data(para_mal, raw_data_mal, event_mapping_data_mal)
+    # print("the event_count_matrix is:", Counter(event_count_matrix[9]))
+    print("the event_count_matrix is:", event_count_matrix_mal)
+    mal_matrix = '../../Dataset_ML/Linux_mal_matrix/mal_matrix.npy'
+    np.save(mal_matrix, event_count_matrix_mal)
+    # np.load(mal_matrix)
\ No newline at end of file
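The month-aware trans_seconds above treats every month as 30 days, so offsets across month boundaries are approximate. A standalone re-statement to check the arithmetic (input values are made up for illustration):

# Quick check of the 30-day-month seconds conversion used in matrixgen.py.
def trans_seconds(month_list, day_list, time_list):
    seconds_list = []
    for i in range(len(day_list)):
        # month/day are relative to the first entry; time-of-day is absolute
        seconds = (int(month_list[i]) - int(month_list[0])) * 30 * 24 * 3600 \
                + (int(day_list[i]) - int(day_list[0])) * 24 * 3600 \
                + int(time_list[i][0]) * 3600 + int(time_list[i][1]) * 60 + int(time_list[i][2])
        seconds_list.append(seconds)
    return seconds_list

# Jun 9 12:00:00 and Jul 10 12:00:30
print(trans_seconds([6, 7], [9, 10], [['12', '0', '0'], ['12', '0', '30']]))
# -> [43200, 2721630]  (second entry: 30 days + 1 day + 12h00m30s, in seconds)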
From 1a38b95660bec17efe4f02388798023c538b8c71 Mon Sep 17 00:00:00 2001
From: wapiti08
Date: Fri, 27 Sep 2019 15:53:28 +0100
Subject: [PATCH 3/8] Add files via upload

---
 demo/PCA_demo_without_labels.py | 130 ++++++++++++++++++++++++++------
 1 file changed, 107 insertions(+), 23 deletions(-)

diff --git a/demo/PCA_demo_without_labels.py b/demo/PCA_demo_without_labels.py
index d54a1c0..4b1c0a2 100644
--- a/demo/PCA_demo_without_labels.py
+++ b/demo/PCA_demo_without_labels.py
@@ -14,35 +14,119 @@
 sys.path.append('../')
 from loglizer.models import PCA
 from loglizer import dataloader, preprocessing
+from collections import Counter
+import pandas as pd
+
+# struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file
+struct_log = '../../Dataset_ML/Linux_matrix/log_matrix.npy'
+mal_struct_log = '../../Dataset_ML/Linux_mal_matrix/mal_matrix.npy'
 
-struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file
 
 if __name__ == '__main__':
-    ## 1. Load strutured log file and extract feature vectors
-    # Save the raw event sequence file by setting save_csv=True
-    (x_train, _), (_, _) = dataloader.load_HDFS(struct_log, window='session',
-                                                split_type='sequential', save_csv=True)
+    # # 1. Load structured log file and extract feature vectors
+    # # Save the raw event sequence file by setting save_csv=True
+    # (x_train, _), (_, _) = dataloader.load_HDFS(struct_log, window='session',
+    #                                             split_type='sequential', save_csv=True)
+    # feature_extractor = preprocessing.FeatureExtractor()
+    # x_train = feature_extractor.fit_transform(x_train, term_weighting='tf-idf',
+    #                                           normalization='zero-mean')
+    #
+    # ## 2. Train an unsupervised model
+    # print('Train phase:')
+    # # Initialize PCA, or other unsupervised models, LogClustering, InvariantsMiner
+    # model = PCA()
+    # # Model hyper-parameters may be sensitive to log data, here we use the default for demo
+    # model.fit(x_train)
+    # # Make predictions and manually check for correctness. Details may need to go into the raw logs
+    # y_train = model.predict(x_train)
+    #
+    # ## 3. Use the trained model for online anomaly detection
+    # print('Test phase:')
+    # # Load another new log file. Here we use struct_log for demo only
+    # (x_test, _), (_, _) = dataloader.load_HDFS(struct_log, window='session', split_type='sequential')
+    # # Go through the same feature extraction process with training, using transform() instead
+    # x_test = feature_extractor.transform(x_test)
+    # # Finally make predictions and alert on anomaly cases
+    # y_test = model.predict(x_test)
+    # print("the result is:",y_test)
+    # print("the labels are:",Counter(y_test))
+
+
+    # example without train_ratio
+    (x_train, _), (_, _) = dataloader.load_Linux(struct_log, window='sliding', split_type='sequential', save_csv=True)
     feature_extractor = preprocessing.FeatureExtractor()
-    x_train = feature_extractor.fit_transform(x_train, term_weighting='tf-idf',
-                                              normalization='zero-mean')
-
-    ## 2. Train an unsupervised model
-    print('Train phase:')
-    # Initialize PCA, or other unsupervised models, LogClustering, InvariantsMiner
-    model = PCA()
-    # Model hyper-parameters may be sensitive to log data, here we use the default for demo
+    x_train = feature_extractor.fit_transform(x_train, term_weighting='tf-idf', normalization='zero-mean')
+
+    # 2. Train an unsupervised model
+    print("Train phase")
+    # Initialize PCA
+    model = PCA()
+    # model hyper-parameters may be sensitive to log data, here we use the default for demo
     model.fit(x_train)
-    # Make predictions and manually check for correctness. Details may need to go into the raw logs
-    y_train = model.predict(x_train)
-
-    ## 3. Use the trained model for online anomaly detection
-    print('Test phase:')
-    # Load another new log file. Here we use struct_log for demo only
-    (x_test, _), (_, _) = dataloader.load_HDFS(struct_log, window='session', split_type='sequential')
-    # Go through the same feature extraction process with training, using transform() instead
-    x_test = feature_extractor.transform(x_test)
+    # make predictions and manually check for correctness. Details may need to go into the raw logs
+    y_train = model.predict(x_train)
+
+    # 3. Use the trained model for online anomaly detection
+    print("Test phase:")
+    # load another new log file; note that the base event set should be as large as possible,
+    # because the same vector position may otherwise carry different meanings --- the vectors cannot be compared
+    (x_test,_),(_,_) = dataloader.load_Linux(mal_struct_log, window='sliding', split_type='sequential')
+    # go through the same feature extraction process as in training
+
+    x_test_original = x_test.copy()
+    # assert x_test == x_train, 'the training data is not the same with testing data'
+    x_test = feature_extractor.transform(x_test)
     # Finally make predictions and alert on anomaly cases
     y_test = model.predict(x_test)
-
+    # build the tracing dict
+    x_y_dict = {}
+    # define the counter
+    i = 0
+    for x,y in zip(x_test_original, y_test):
+        x_y_dict[str(x)+','+str(i)] = y
+        i += 1
+    # print("the result is:", len(y_test))
+    # print("the key names are:", x_y_dict.keys())
+    # get the indexes of anomaly sequences
+    anomaly_sequence_index = [i for i in range(len(y_test)) if y_test[i] == 1]
+    print("the index of anomaly sequence is:", anomaly_sequence_index)
+
+    # trace the index in the sliding_file_path
+    sliding_file_path = '../../Dataset_ML/Linux_mal_sliding_24h_3h.csv'
+    for index in anomaly_sequence_index:
+        # read the sliding file: start_end_index
+        fd = pd.read_csv(sliding_file_path, header=None)
+        start_index, end_index = None, None
+        # get the start and end indexes of the flagged window
+        start_index = fd.iloc[index,:][0]
+        end_index = fd.iloc[index,:][1]
+        print("please check log csv indexes between {} and {}".format(start_index, end_index))
+
+    anomaly_sequence = []
+    for index in anomaly_sequence_index:
+        # anomaly_sequence = [var for var in x_y_dict.keys() if int(var.split(',')[-1]) == index]
+
+        for var in x_y_dict.keys():
+            # print("the var is:",var)
+            if int(var.split(',')[-1]) == index:
+                # print out the anomaly test_x sequence
+                # print(var)
+                anomaly_sequence.append(var)
+
+    # print("the anomaly sequence is:", len(anomaly_sequence))
+    print("the labels are:", Counter(y_test))
+    print("the counter is {} and the anomaly rate is: {}".format(Counter(y_test), len(anomaly_sequence)/x_test.shape[0]))
+
+'''
+For HDFS:
+the result is: [0. 0. 0. ... 0. 0. 0.]
+the labels are: Counter({0.0: 3951, 1.0: 19}) --- there are 19 anomalies
+For Linux_logs:
+Counter({0.0: 163, 1.0: 3/5}) 0.0184 --- 0.0307
+For Linux_mali_logs:
+Counter({0.0: 127, 1.0: 25}) 0.1969
+'''

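A small aside on the tracing step above: the csv of (start_index, end_index) pairs written by Linux_preprocess_data can be loaded once instead of once per anomaly. A sketch of the equivalent lookup, reusing the demo's sliding_file_path and anomaly_sequence_index (same two-column file layout; this is an optimization sketch, not the demo's code):

import pandas as pd

windows = pd.read_csv(sliding_file_path, header=None).values
for index in anomaly_sequence_index:              # window indexes flagged by the model
    start_index, end_index = windows[index]
    print("please check log csv indexes between {} and {}".format(start_index, end_index))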
From 78840f001dbd27f1c42eedb07300f033e5ba0cce Mon Sep 17 00:00:00 2001
From: Wapiti08
Date: Mon, 14 Oct 2019 14:34:50 +0100
Subject: [PATCH 4/8] Add files via upload

---
 loglizer/exec.sh | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
 create mode 100644 loglizer/exec.sh

diff --git a/loglizer/exec.sh b/loglizer/exec.sh
new file mode 100644
index 0000000..e2d0e01
--- /dev/null
+++ b/loglizer/exec.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+path1='../../Dataset_ML/Linux/Client/Client_train/structured_log.csv'
+path2='../../Dataset_ML/Linux/Client/Client_train/Event_dict.pkl'
+path3='../../Dataset_ML/Linux/Client/Client_train/structured_log_id.csv'
+path4='../../Dataset_ML/Linux/Client/Client_train/Linux_matrix/log_matrix.npy'
+path5='../../Dataset_ML/Linux/Client/Client_com/structured_log.csv'
+path6='../../Dataset_ML/Linux/Client/Client_com/Event_dict.pkl'
+path7='../../Dataset_ML/Linux/Client/Client_com/structured_log_id.csv'
+path8='../../Dataset_ML/Linux/Client/Client_com/Linux_matrix/log_matrix.npy'
+
+python3 matrixgen_client.py --p1 $path1 --p2 $path2 --p3 $path3 --p4 $path4 --p5 $path5 --p6 $path6 --p7 $path7 --p8 $path8
+
+exit 0

From dee0edbd17bfa08236f14dabc998aeba7a258397 Mon Sep 17 00:00:00 2001
From: Wapiti08
Date: Mon, 14 Oct 2019 14:35:57 +0100
Subject: [PATCH 5/8] Update dataloader.py

add a version that can process dataframes whose timestamps are in
decreasing order
---
 loglizer/dataloader.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/loglizer/dataloader.py b/loglizer/dataloader.py
index 3f203e2..83b2f69 100644
--- a/loglizer/dataloader.py
+++ b/loglizer/dataloader.py
@@ -350,9 +350,10 @@ def Linux_preprocess_data(para, raw_data, event_mapping_data):
     start_end_index_list = []   # list of (start_index, end_index) tuples marking each sliding time window
     # get the list of time data
     time_data = raw_data
-
+    print("the time_data is:", time_data)
     if not os.path.exists(sliding_file_path):
         start_time = time_data[0]
+        print("the start_time is:",start_time)
         start_index = 0
         end_index = 0
         # find the end of the first window in one pass using window_size
@@ -364,7 +365,8 @@ def Linux_preprocess_data(para, raw_data, event_mapping_data):
             start_end_pair = tuple((start_index, end_index))
             start_end_index_list.append(start_end_pair)
             break
-        # slide the block and change the index of start and end
+
+        # slide the block and change the index of start and end
         while end_index < log_size:
             # add the sliding size to start time
             start_time = start_time + para['step_size']*3600
             end_time = end_time + para['step_size']*3600
@@ -428,4 +430,3 @@ def Linux_preprocess_data(para, raw_data, event_mapping_data):
 
 
 
-
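exec.sh drives matrixgen_client.py, which is referenced here but not included in this patch series; judging from PATCH 6 below, it parses the same --p1..--p8 flags with optparse. A minimal sketch of such a parser (the generic dest names are placeholders; PATCH 6 uses descriptive ones like structured_log_filename):

import optparse

# build an interface matching exec.sh's eight --pN path arguments
parser = optparse.OptionParser('usage: %prog --p1 <path> ... --p8 <path>')
for n in range(1, 9):
    parser.add_option('--p%d' % n, dest='p%d' % n, type='string')

(options, args) = parser.parse_args(['--p1', 'structured_log.csv', '--p8', 'log_matrix.npy'])
print(options.p1, options.p8)   # -> structured_log.csv log_matrix.npy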
From 03263a743810c4568f8d4fbf948f3dfc03ea7909 Mon Sep 17 00:00:00 2001
From: Wapiti08
Date: Mon, 14 Oct 2019 14:36:36 +0100
Subject: [PATCH 6/8] Update matrixgen.py

---
 loglizer/matrixgen.py | 199 ++++++++++++++++++++++++++----------------
 1 file changed, 126 insertions(+), 73 deletions(-)

diff --git a/loglizer/matrixgen.py b/loglizer/matrixgen.py
index a572a62..cfde401 100644
--- a/loglizer/matrixgen.py
+++ b/loglizer/matrixgen.py
@@ -5,6 +5,7 @@
 import re
 from dataloader import *
 import joblib
+import optparse
 
 # function to transform hours and minutes to seconds (note: shadowed by the month-aware trans_seconds defined below)
 def trans_seconds(time_list):
@@ -81,46 +82,90 @@ def Event_Convert(fd, filename):
 
 if __name__ == "__main__":
 
     # define the window_size and step_size to get the time sequence
     para = {}
-    para['save_path'] = '../../Dataset_ML/Linux'
-    para['window_size'] = 24   # 24 hours ---- one day
-    para['step_size'] = 3   # 3 hours
+    para['save_path'] = '../../Dataset_ML/Linux/Client/Client_train/'
+    para['window_size'] = 0.5   # 0.5 hours ---- half-hour window
+    para['step_size'] = 0.2   # 0.2 hours
+
+    # =============================== generate the event matrix for normal linux logs =========================
+
+    # set the format of the command input
+    parser = optparse.OptionParser('usage %prog --p1 <structured_log_filename> \
+                    --p2 <dict_filename> --p3 <structured_log_id_filename> --p4 <matrix> \
+                    --p5 <structured_log_com_filename> --p6 <dict_filename_com> --p7 <structured_log_id_com_filename> \
+                    --p8 <matrix_com>')
+    # set the elements for every parameter
+    parser.add_option('--p1', dest='structured_log_filename', type='string', help='Please input the structured log filename: ')
+    parser.add_option('--p2', dest='dict_filename', type='string', help='Please input the dict filename for training data: ')
+    parser.add_option('--p3', dest='structured_log_id_filename', type='string', help='Please input the structured log id filename: ')
+    parser.add_option('--p4', dest='matrix', type='string', help='Please input the location where you want to save the matrix: ')
+    parser.add_option('--p5', dest='structured_log_com_filename', type='string', help='Please input the coming structured log filename: ')
+    parser.add_option('--p6', dest='dict_filename_com', type='string', help='Please input the dict filename for testing data')
+    parser.add_option('--p7', dest='structured_log_id_com_filename', type='string', help='Please input the coming structured log id filename: ')
+    parser.add_option('--p8', dest='matrix_com', type='string', help='Please input the location where you want to save the coming matrix: ')
+
+
+    # parse the arguments through parse_args()
+    (options, args) = parser.parse_args()
+    # get the values from options
+    structured_log_filename = options.structured_log_filename
+    dict_filename = options.dict_filename
+    structured_log_id_filename = options.structured_log_id_filename
+    matrix = options.matrix
+    structured_log_com_filename = options.structured_log_com_filename
+    dict_filename_com = options.dict_filename_com
+    structured_log_id_com_filename = options.structured_log_id_com_filename
+    matrix_com = options.matrix_com
 
     # get the linux dataframe
-    fd_linux = pd.read_csv('../../Dataset_ML/Linux_2k.log_structured.csv')
+    fd_linux = pd.read_csv(structured_log_filename)
     # make a copy to avoid modifying the original data
     fd_linux = fd_linux.copy()
 
-    filename = '../../Dataset_ML/Linux_matrix/Event_dict.pkl'
-    # check whether the event dict already exists
-    if os.path.isfile(filename):
-        event_map = joblib.load(filename)
+    # dict_filename has been given by the parser
+    # check whether the dict file already exists
+    if os.path.isfile(dict_filename):
+        event_map = joblib.load(dict_filename)
     else:
-        event_map = Event_Convert(fd_linux, filename)
+        event_map = Event_Convert(fd_linux, dict_filename)
+    # invert the dict: hashed event -> numerical id
+    event_map = {val: key for (key, val) in event_map.items()}
+
+    #for i in range(len(fd_linux['EventId'])):
+    #    for key, value in event_map.items():
+    #        # print("the key {} and value {}".format(key, value))
+    #        if fd_linux['EventId'][i] == value:
+    #            # replace the hashed eventId with the numerical id
+    #            fd_linux.is_copy = False
+    #            fd_linux['EventId'][i] = key
+    #            print("the replaced eventId is:", fd_linux['EventId'][i])
+
+
+    #fd_linux['EventId'].map(event_map).fillna(fd_linux['EventId'])
+    fd_linux['EventId'] = fd_linux['EventId'].map(event_map)
+
+    # structured_log_id_filename has been given by the parser
+
+
+    fd_linux.to_csv(structured_log_id_filename, index=False)
+    # read the saved csv
+    fd_linux_id = pd.read_csv(structured_log_id_filename)
+    # reverse the row order so timestamps are in increasing order
+    fd_linux_id_sort = fd_linux_id.copy()
+    fd_linux_id_sort.sort_index(axis=0, ascending=False, inplace=True)
+    # reset the index
+    fd_linux_id_sort = fd_linux_id_sort.reset_index(drop=True)
+    print(fd_linux_id_sort.head())
     # part to transform the month, date, time into seconds
-    month_list, time_list, day_list = [], [], []
+    month_list, time_list, day_list = [], [], []
 
-    for i in range(len(fd_linux_id['Time'])):
-        time_list.append(fd_linux_id['Time'][i].split(':'))
-    for j in range(len(fd_linux_id['Date'])):
-        day_list.append(fd_linux_id['Date'][j])
+    for i in range(len(fd_linux_id_sort['Time'])):
+        time_list.append(fd_linux_id_sort['Time'][i].split(':'))
+    for j in range(len(fd_linux_id_sort['Date'])):
+        day_list.append(fd_linux_id_sort['Date'][j])
 
     month_number = 0
-    for k in range(len(fd_linux_id['Month'])):
-        # print("we are transferring the month:",fd_linux['Month'][k])
-        month_number = month_string_to_number(fd_linux_id['Month'][k])
+    for k in range(len(fd_linux_id_sort['Month'])):
+        month_number = month_string_to_number(fd_linux_id_sort['Month'][k])
         month_list.append(month_number)
 
     seconds_list = trans_seconds(month_list, day_list, time_list)
@@ -130,6 +175,7 @@ def Event_Convert(fd, filename):
     event_mapping_data = []
     Event_ids = []
     # get the digits part of the eventID
+
     Event_ids = [int(x) for x in fd_linux_id['EventId']]
 
     for id, log in zip(Event_ids, fd_linux_id['EventTemplate']):
@@ -140,69 +186,76 @@ def Event_Convert(fd, filename):
     event_count_matrix = Linux_preprocess_data(para, raw_data, event_mapping_data)
     # print("the event_count_matrix is:", Counter(event_count_matrix[9]))
     print("the event_count_matrix is:", event_count_matrix)
-    matrix = '../../Dataset_ML/Linux_matrix/log_matrix.npy'
+    # the matrix path has been given by the parser
     np.save(matrix, event_count_matrix)
-    # np.load(matrix+'.npy')
 
 
-    # =============================== generate the event matrix for malicious linux logs =========================
+    # =============================== generate the event matrix for coming linux logs =========================
 
-    para_mal = {}
-    para_mal['save_path'] = '../../Dataset_ML/Linux_mal'
-    para_mal['window_size'] = 24   # 24 hours ---- one day
-    para_mal['step_size'] = 3   # 3 hours
+    para_com = {}
+    para_com['save_path'] = '../../Dataset_ML/Linux/Client/Client_com/'
+    para_com['window_size'] = 24   # 24 hours ---- one day
+    para_com['step_size'] = 3   # 3 hours
 
-    fd_linux_mali = pd.read_csv('../../Dataset_ML/malicious_linux.log_structured.csv')
-    fd_linux_mali = fd_linux_mali.copy()
+    # structured_log_com_filename has been given by the parser
+    fd_linux_com = pd.read_csv(structured_log_com_filename)
+    fd_linux_com = fd_linux_com.copy()
 
-    filename_mali = '../../Dataset_ML/Linux_mal_matrix/Event_mal_dict.pkl'
-    # check whether the event dict already exists
-    if os.path.isfile(filename_mali):
-        event_map_mal = joblib.load(filename_mali)
+    # dict_filename_com has been given by the parser
+    # check whether the dict file already exists
+    if os.path.isfile(dict_filename_com):
+        event_map_com = joblib.load(dict_filename_com)
     else:
-        event_map_mal = Event_Convert(fd_linux_mali, filename_mali)
+        event_map_com = Event_Convert(fd_linux_com, dict_filename_com)
+
+    for i in range(len(fd_linux_com['EventId'])):
+        for key, value in event_map_com.items():
+            fd_linux_com.is_copy = False
+            if fd_linux_com['EventId'][i] == value:
+                fd_linux_com['EventId'][i] = key
 
-    for i in range(len(fd_linux_mali['EventId'])):
-        for key, value in event_map_mal.items():
-            fd_linux_mali.is_copy = False
-            if fd_linux_mali['EventId'][i] == value:
-                fd_linux_mali['EventId'][i] = key
+    # structured_log_id_com_filename has been given by the parser
+    fd_linux_com.to_csv(structured_log_id_com_filename, index=False)
 
-    fd_linux_mali.to_csv('../../Dataset_ML/malicious_linux.log_structured_id.csv', index=0)
+    fd_linux_com_id = pd.read_csv(structured_log_id_com_filename)
+    fd_linux_com_id = fd_linux_com_id.copy()
 
-    fd_linux_mali_id = pd.read_csv('../../Dataset_ML/malicious_linux.log_structured_id.csv')
-    fd_linux_mali_id = fd_linux_mali_id.copy()
+    fd_linux_com_id.sort_index(axis=0, ascending=False, inplace=True)
+
+    fd_linux_com_id = fd_linux_com_id.reset_index(drop=True)
+
+    fd_linux_com_id = fd_linux_com_id.copy()
 
     # part to transform date and time into seconds
-    month_list_mal, time_list_mal, day_list_mal = [], [], []
+    month_list_com, time_list_com, day_list_com = [], [], []
 
-    for i in range(len(fd_linux_mali_id['Time'])):
-        time_list_mal.append(fd_linux_mali_id['Time'][i].split(':'))
-    for j in range(len(fd_linux_mali_id['Date'])):
-        day_list_mal.append(fd_linux_mali_id['Date'][j])
+    for i in range(len(fd_linux_com_id['Time'])):
+        time_list_com.append(fd_linux_com_id['Time'][i].split(':'))
+    for j in range(len(fd_linux_com_id['Date'])):
+        day_list_com.append(fd_linux_com_id['Date'][j])
 
-    month_number_mal = 0
-    for k in range(len(fd_linux_mali_id['Month'])):
+    month_number_com = 0
+    for k in range(len(fd_linux_com_id['Month'])):
         # print("we are transferring the month:",fd_linux['Month'][k])
-        month_number_mal = month_string_to_number(fd_linux_mali_id['Month'][k])
-        month_list_mal.append(month_number_mal)
+        month_number_com = month_string_to_number(fd_linux_com_id['Month'][k])
+        month_list_com.append(month_number_com)
 
-    seconds_list_mal = trans_seconds(month_list_mal, day_list_mal, time_list_mal)
+    seconds_list_com = trans_seconds(month_list_com, day_list_com, time_list_com)
 
-    raw_data_mal = np.array(seconds_list_mal)
+    raw_data_com = np.array(seconds_list_com)
 
-    event_mapping_data_mal = []
-    Event_ids_mal = []
+    event_mapping_data_com = []
+    Event_ids_com = []
     # get the digits part of the eventID
-    Event_ids_mal = [int(x) for x in fd_linux_mali_id['EventId']]
+    Event_ids_com = [int(x) for x in fd_linux_com_id['EventId']]
 
-    for id, log in zip(Event_ids_mal, fd_linux_mali_id['EventTemplate']):
-        event_mapping_data_mal.append([id, log])
+    for id, log in zip(Event_ids_com, fd_linux_com_id['EventTemplate']):
+        event_mapping_data_com.append([id, log])
 
 
-    event_count_matrix_mal = Linux_preprocess_data(para_mal, raw_data_mal, event_mapping_data_mal)
+    event_count_matrix_com = Linux_preprocess_data(para_com, raw_data_com, event_mapping_data_com)
     # print("the event_count_matrix is:", Counter(event_count_matrix[9]))
-    print("the event_count_matrix is:", event_count_matrix_mal)
-    mal_matrix = '../../Dataset_ML/Linux_mal_matrix/mal_matrix.npy'
-    np.save(mal_matrix, event_count_matrix_mal)
-    # np.load(mal_matrix)
\ No newline at end of file
+    print("the event_count_matrix is:", event_count_matrix_com)
+    # matrix_com has been given by the parser
+    np.save(matrix_com, event_count_matrix_com)
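The dict inversion plus Series.map() introduced above replaces the quadratic per-row loop from PATCH 2. The idiom on a toy frame (values are made up; Event_Convert stores id -> hashed event, so the mapping must be inverted before use):

import pandas as pd

event_map = {'1': 'a3f5', '2': '9bc0'}                 # id -> hashed event, as dumped by Event_Convert
inv = {val: key for (key, val) in event_map.items()}   # hashed event -> id, as done in the patch
df = pd.DataFrame({'EventId': ['a3f5', '9bc0', 'a3f5']})
df['EventId'] = df['EventId'].map(inv)                 # vectorized replacement, no per-row loop
print(df['EventId'].tolist())                          # -> ['1', '2', '1']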
From 7ebb2b4aaf5c6e440e497c897333bba641734dbc Mon Sep 17 00:00:00 2001
From: Wapiti08
Date: Mon, 14 Oct 2019 14:36:59 +0100
Subject: [PATCH 7/8] Update preprocessing.py

From b7af83c959c5b84e7291e66e838b6bcc9d390bca Mon Sep 17 00:00:00 2001
From: Wapiti08
Date: Wed, 13 Nov 2019 16:22:30 +0000
Subject: [PATCH 8/8] Update PCA.py

in line 67: i is not assigned before it is referenced
---
 loglizer/models/PCA.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/loglizer/models/PCA.py b/loglizer/models/PCA.py
index 6d6a437..22ebf5a 100644
--- a/loglizer/models/PCA.py
+++ b/loglizer/models/PCA.py
@@ -64,7 +64,7 @@ def fit(self, X):
             variance += sigma[i]
             if variance / total_variance >= n_components:
                 break
-        n_components = i + 1
+            n_components = i + 1
 
         P = U[:, :n_components]
         I = np.identity(num_events, int)
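To see what the one-line PATCH 8 change does, here is a toy re-statement of PCA.fit's component-selection loop (the names follow the hunk's locals, but this is an illustration, not the module's code). With the assignment inside the loop, an empty range can no longer leave i unbound, and n_components simply keeps its initial value:

import numpy as np

def pick_components(sigma, target_ratio):
    total_variance = np.sum(sigma)
    variance = 0
    n_components = target_ratio          # initial value, as in PCA.fit before the loop
    for i in range(len(sigma)):
        variance += sigma[i]
        if variance / total_variance >= target_ratio:
            break
        n_components = i + 1             # assignment kept inside the loop, as in the patch
    return n_components

print(pick_components(np.array([5.0, 2.0, 1.0, 0.5]), 0.95))   # -> 3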