From d43ed4658adf695a39256fe6a2bbff1f3514732b Mon Sep 17 00:00:00 2001 From: matt Date: Sat, 22 Apr 2023 13:00:24 -0700 Subject: [PATCH] add progress and better scraping. --- .gitignore | 1 + docs/progress.md | 66 ++++++++++---- docs/progress_spec_3.docx | Bin 0 -> 15007 bytes src/bias.py | 180 ++++++++++++++++++++++++++++++++++++++ src/join_bias.py | 46 ---------- src/scrape.py | 42 ++++++--- src/word.py | 25 +++++- 7 files changed, 287 insertions(+), 73 deletions(-) create mode 100644 docs/progress_spec_3.docx create mode 100644 src/bias.py delete mode 100644 src/join_bias.py diff --git a/.gitignore b/.gitignore index cef562a..d7d7bff 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ *.csv *.swp __pycache__ +tmp.py diff --git a/docs/progress.md b/docs/progress.md index 0ad60de..2ffbac9 100644 --- a/docs/progress.md +++ b/docs/progress.md @@ -1,31 +1,39 @@ # Data Mining - CSCI 577 -# Project Status Report I +# Project Status Report III -*2023-04-04* +*2023-04-18* ## Participants Matt Jensen -## Overarching Purpose +Computer Science 477/577 +Project Status Report III +Due: Tuesday, April 18 -I hope to use a dataset of new articles to track the polarization of news over time. -I have a hypothesis that news has become more polarized superficially, but has actually converged into only two dominate views points. -I think there is a connection to be made to other statistics, like voting polarity in congress, or income inequality, or consolidation of media into the hands of the few. +## Tools -## Data Source +> The third project progress report should include a preliminary account of the existing software tools you will be using. +> Ideally, you obtain the software you will (probably) need and run it on sample files (or your real files), so make sure that you understand how they work. +> Do not wait verify that there are no hidden complications. 
+> The are many plausible sources for such software, including the following: -To test this thesis, I will crawl the archives of [memeorandum.com](https://www.memeorandum.com/) for news stories from 2006 onward. -I will grab the title, author, publisher, published date, url and related discussions and store it in a .csv. -The site also has a concept of references, where a main, popular story may be covered by other sources. -So there is a concept of link similarity that could be explored in this analysis too. +I will use the following suite of python tools to conduct my research: -## Techniques +- python +- pytorch +- scikit-learn +- duckdb +- requests +- pandas +- matplotlib +- seaborn -I am unsure of which technique specifically will work best, but I believe an unsupervised clustering algorithm will serve me well. -I think there is a way to test the ideal number of clusters should exist to minimize the error. -This could be a good proxy for how many 'viewpoints' are allowed in 'mainstream' news media. +## Purpose + +> This progress should also provide a definitive description of your purpose and how you intend to conduct it. +> This should take the form of a detailed outline of the procedures you will undertake in exploring your dataset(s) and maximizing the knowledge that can be extracted from it. \newpage @@ -103,3 +111,31 @@ Another goal is to look at the political alignment over time. I will train a classifier to predict political bias based on the word embeddings as well. There is a concept of the [Overton Window](https://en.wikipedia.org/wiki/Overton_window) and I would be curious to know if title of new articles could be a proxy for the location of the overton window over time. +\newpage + +# Project Status Report I + +*2023-04-04* + +## Participants + +Matt Jensen + +## Overarching Purpose + +I hope to use a dataset of new articles to track the polarization of news over time. 
+I have a hypothesis that news has become more polarized superficially, but has actually converged into only two dominant viewpoints. +I think there is a connection to be made to other statistics, like voting polarity in Congress, or income inequality, or consolidation of media into the hands of the few. + +## Data Source + +To test this hypothesis, I will crawl the archives of [memeorandum.com](https://www.memeorandum.com/) for news stories from 2006 onward. +I will grab the title, author, publisher, published date, url and related discussions and store them in a .csv file. +The site also has a concept of references, where a main, popular story may be covered by other sources. +So there is a concept of link similarity that could be explored in this analysis too. + +## Techniques + +I am unsure of which technique specifically will work best, but I believe an unsupervised clustering algorithm will serve me well. +I think there is a way to determine the ideal number of clusters that should exist to minimize the error. +This could be a good proxy for how many 'viewpoints' are allowed in 'mainstream' news media. 
diff --git a/docs/progress_spec_3.docx b/docs/progress_spec_3.docx new file mode 100644 index 0000000000000000000000000000000000000000..c065b5fddc6345416190d7745e49efe59dd6d290 GIT binary patch literal 15007 zcmeHug;$-q_V&RYio3gefuhCT-QC^Y9a`MoU5dNAySuhH#rP(?bu~!EFfWI86F^*T=STOWDJ;INwtJRtal+jM(0lSi_*!VpB(btlttsPVHBM<{|eQ)fz0<=teUU~qk>OJ5qRYH_9$)^EK1b*)b%!?=O zb3ci5q^9_yl>j&hdfBibvE}1bZ#EJ{*qW1Ov+`PTGJXd2a6zXI$0Hf>4hag;=F{$G zcvlg)NPNj+>Om0a7KE16VpKx|#M##BxqcNMs>(J^Re7``D|p+-(itRmuT!+#BwtY% zYpH4F@+s>{pZH$d6BHs!dW0Eu7k=FpjIL2+mX7tyXe6jgplXLRLDlZ-D|3|2G0x9a zBN?G3exn9;3mG56762@pugf?ZX$moO9A9||DiZFazTJ7`DIc=f`X*ohP+264y;;8c z+#ZWk46^LmY8b-%u32wyAOP9FO;5ZStj6>A9O?Jj2lGBXb?l8S9q8$PlmDBR|A*(| zU%q;IZ1?-LgylOAcnO$pms{z^$djfwm|DeHhJe%%lSE!!GGBas;aXY(*7~6*GCDIG zH|6e-F6^`wuXTZys00_@4z=*0-K+7`<_bs%YAaxLU9jDOO*^!NM?$UU#Ks6Q--1}X`BdE@&^md*nJAifWZi;cY@ zy`hbPlhykg_S@ohqM>H9%Z}u&qx%N3U1XhYkkMdh5bNhT4xWq)nn81$5@Q;HEQTUV z_4R4dCILQ6Ox^D8x~_d#fF@>VP=8%WuJidZo@_5htY64^`CE%|NHq-#;v`y9#@pIB z-7~ygsxhLJ-~b&bb>XK@?UyV3PMVMy$cT8ORDfV%@F-W?NuT&$fRWzmqA#v|V+S;~}0>~73hRtN-C`Bjpri&6>b&?-Q;KbP$fs6^M+*S{=*Xs@C3-luSx9A?Y^*pC0?HeNG*bQ<%#AFB{IPwC}3tkH7*Z8E95=#*^ zY>szp}L+;oX0giY0`G^0!Lx5NjUde>1Aj}F@rL;lzVT6+8ifHOwH?u8iUl= zt}88zbrwAbb&|*GL*ufN^b6sg$t=QaUU}PVLUt)KPVk12QiY8prJhuX=*Ymz0BA;r~2O9ZQLNL4Nl0y%ErzOqYQu^_}hSj=AyTpo* zd1UjdRU#iFn_@g*c9XF{ba3-B zu8Urh!>~5bE24*c8aGF$YU-Rv{7_tUsg8E>dg9#oBeX2G$KvlBCRLE+shOD1*A=i= zVL+XpCr8?gCEwFTnfmFX?0<&G=^Q|2$cWZAxM4lTB7)3YJy5&oZ(M-FjKPCd7ny(4 zONxq`@BlC1b11$E(1Q`XYEsu%&YzFta+6Fw9CO-qnyXX0`MC%(L55>vcWQZZMrSNI zs3}ZR)eCs0%ho_t0|Q2lAPR$=?8y(s$hiPWxQ;MWsRRb+P%ugL;mBg>$z-!fYa@*b zBq;(ndB(q|YMOwM!V%*B-22Z@8E0V=>_c zXkHV@?06Jq8X)4Y4ILsP#!1dX6q?0hXFP=Z@c3C2hy1fYOUVk4H?U~z41_P8EDhs4 zsdJ2wWTHnQ37SN@hfubnD*#9ebU2Vq0$PEKY51_r9D@r4QdE;nbSy8_2$U@v2~XJr ze;wV(1KyuT3S0#NKpjhtf`Tez;j*7j&aPSb`1+ z?xpiXp-0CLxG`?GrtYB{Pm62ysN! 
z+OCZ8a&Dzl4h0vM?;M*7N^FnUO1L|irA9pmzGuq?8{-_H*_)$&OZGdZXjS%Pdiw~1 z@OB}}+9Jub2nGzggeuASNhEA-mHazP$2V4o`_cDR;5R1n(rdno0=>m3b!k<$)QxN5 zn~M{y3feRC9X}>D7c)+dO3gI9c`=fnIb)6vQFS~fUT@UQ(sxp^ik={bY?K}~7X}tT zy(}-?37=(F-Lz$&gBK({>6vxMbbGBbYE*iB^={I9!hF)Z;CMJ8(5qHJi@_duj=70c}9)jH~2!{gQJF z+2ixs#Tn1hRfD;ViNWCMX%Ldl(S7a4UhB${$KC)xtL*>}iG)YY>KP}p&4t08!6xzD zgUH!i4a5qWep+58_jib=Dk#ZG=k{UimxqbwPo3lt3&r`Fv>|rYysF+N|m(s9bf> zRt(9TzGRO9sv}ZG=xdRZs^FQ5A)%mPDt*TYGZg3f7U8R?H1fRFejCneB~vRc@k!Ll z20`^^#$tqL?EnbE5z^ZD#zb6&*_P^y3L8piVl=EVcc-MIx8zSx>A_{`IXKPreod>1 zf_aFUE?=;ZJU@Rbs`njm?WXcFH7VxoajbB6FwjTw?+1;6xqdmwM$YuwNinu91A`Jm zL(L)6bb+EYDI=eD)i3>2N2~za8^*E^#f~LsDrufpYqLLyDhEVj4R4iDR3;9aiFg}N zzUA5ll7O;3D&Kz%ar2FYsz8QoUnf<7A0b7xha}>$SXq>rixWaWmPL@BEs03TUZaV@ ziEy^#<(7vCk=KRZ^GOyo2oY=nKbp9t~{5CO7qMH_I+Fn_(%uxJf85H*m&S z6^S~Prgjlmv~tImjpc9#O=k~M!%6h7QiyU9ZIz&i5%fES3?%mzF$+ZNYc>gALI+I) zgG?L>1w(7#D{Rn%Do1DRTRu$YD=Qs4*T7)*+92_|QANySSN39To+P#l*Pi{Xc|03l z@>KS|`R850g>i?Q)I0wvf8V6z03d;WXF31c3jCGp{IM+ndf%P*SS1ZbWv zH#5hvV7)%zsgmJ*I{jN7AH211U3pU}vcdvm<`#+gcat7xBtt$u12UYO1( zfnmwSZ#7S|qr-rKqE-STm+4TEzFf{Csd$OI^5x$?w)cLVEwEiJ;uj++%GW-_{iwQW zyDz`P=V#lJGksmc>FIIRZ%A@kCv2#X@8x$$2G|Uu?QNCCbc<07(0dtT+f9%k>O? 
zZS+(jZ*J^>L6l{)L<@Z23~Tw#Wm^?qd)2k*ZnOI-P5%N^K5R_6#bPCU16XaeAVoy$ zZTqfXft5yN01a@mVYS61VQtl^&-1iW^&oOGzr>P>C4%E=;MEW9`VszV*d||r7V@gDCy9hl$8s(MQTz&BbL-4CAJEF$vT(P!IIU<0WMaI(t^@a+n9~Cuo&pLTB~wC&Q5RUPG*4*Xsfa7vM;nJXyDWN zl_#cL>R2t6haX=Ge89Rqk=ab)QUeZVvVeUVP?*P?zw>C^bZO~IWVe}KtlQjRv%=~8 z++6B<%6v9;1sjPt1NUZ!Rc55|`hOq^ka4BK?j z;-kNQgTzqs7PheKphHN-BX9@5l0+GGu}alM zRvVYk&*O*LqC**P?Z|dB7Z5|c<+D=+otK+uhvP||4xgtJ3+Bv(OB?UUopGDaw>Xc= z>Bocqv?O|MpQn9%ULNKyFKA(-dq#1>TI+6wV>+>ZP&kk-L@&08UC`{BpsapaL%tP^ zDUeLl?-~xien5)>?l|WPgy>{jH_HNFF68)4a4>z@;uJyfko-LYu?`&-zAAm~p@|p| zm{r4cLH9+TUFud(gs3f1y&KwN63rIAf=8o-U>qZ8G{jHgGiC7bjYeA(K#}hmO4dM# zo(CDih&5oDQJx+74lXXFeL&_L;gS5gNlRGS(IbhSh#%#r7J7=aN7W zi*H;o@1HGU0C>*RO(f=Or3-S|ST!g|=}Fv`qLmHq`}@{;p(7P^J0ih>gegzE0zt9X zW>kO2P^~L?s-T%$B%;=TQ{#KB-frd~{c=g~Dzzg0Nmj0HGSJydZnQ>z zImx0P{J0h<+jOVXXnNomn|LE}%yW>$J%;&&NnRlwd6+!K@*8_B+lC$Gvc^3eURitG z!`_JWM=Em6uY(MG8tTeQ(@C6+x7tWF)P`iD_@vV%tUb)`=%IXW6{)bgffzc%%=~qA zO)7Glab^9=bKa~PR&u^YP^#$^MvRl&rEU2NsUi9;Jk;$S$zv3WVlCGt9^Yb4D8ddB zK?E~XjbB`SLOjFf#qU^^n;okxw2G!$tcv>f5dx9w=XJ4IhnM)Q=1Ub;w}Y}-*6hu! 
zIf#Ftc`H-63 zQ7Az_i!!VsEIOA@-g!;`%_}6{l;caIsQMmh;lWrsCfs3eo5$PGcioU6lD1U#*PugK zTcwLw#k8=i8*dKDbU(#nD#U%%Om+yGj5`&d*Guax@p6>nVIWyWCZqnoWh$kjO6mw& zV^Qr3EEPayv#w}&-cl26SV$orTKc?bOuB3?y!DowK~df|auK4^iX1E^C1{doMV;8l zqJ{j{*`Ru8(!G@wTAL|GW&GuLXh}kF?P1!I0waUY5%8^r*c0ztBu?Rqjg33)ixv;M zk-UD1o>}nvEwDr+x==Z6f)e+^ZRXs(bC`?u_S_s`hfU_9*`d5eM9@jd0Wfb*T#*^O zCSA7?fsw__$SAISnR4FOdR=Kq!;+EHGF9L-6_r(@paxOAGEwH8MKDy{AMoXqG5Pc* zsc0?4&3w71!Dt)5+$$}&Pk^G>x1Im{SYPquk8_!lnPrc&N;%{PuXFmIQ`J3 zisKmHj|QbkK~sZ;+V$};0V!R*J=SN>9Ns~MPs0(-82YBDJi}C1ju5=+(NMg?HO$HI z3juv6vXySz({#xwKCWGImw)`db`iS*M0nB%PvMSjGbL}+e4;K><9Qpwi&}tf0k@BT zZWHgEYgmjqft1bO*jis~th-R))=mRe&c}^0Otq?S00>ZXE3kb5(9JZiOhB0{tUX&68E_8Y5r8|K5Ud8sFja(8qBf6y#Fdq0)G95!Vd@UA zChX1BdQ!8nA70?D=Yw{ek8Rzh9>JmCOki3)L%lwa)$E!aGqS@G+#DtoQ$9?~j?>_4 z7&=3$*$y^V|E-RsMtiHCV-^)DEwgdir+6|ovBFD$2o zdQty%L#89wpgiV}f-;g)L!pmEJ0~MS!AVi|ZxyywfVk z!$1~$yk*14qEXpaNgzG%RkV#iJPa&aU9uZrkT>(x8GJiUbq#WROu`p;HB%!!`iOFx zBO?Q9Y>Q>I!c1S(Yrv^#lzPWO{7-H=8~5jwVk!T%TrBz(II#VGjn9oWd%8(GB1=9^ zB+xv?7V=Ej#fy3j3kMk=g&;PH$y=~yM{X3rUrMIn2=2nZQC{NfPp8!q$bMb++kfG2 zGX*IU?71k_bD^T6xlQVRW}tPmD_itfqFuGc1G_9fmE>jOR7rUNzk3C%G7;UW=`A;# zOZ7$=TpTsTqYu7(UcnE}!G|r1kYh+6&trgWGh_Pt;?qi2|E8h&0`X7ew$S3U_NR9% z6#hv5M}=Z*!}>e~>M2{X*b>H=hK=wM)l z$pUXxL~9q;KGHXgg7obar>^Fko9#uNjuZZd90{7at{DAlt~I0SG4cE{e!x$PZj}AI zctQU|PttQ@X&{{n(ZKGn5g6mp=Qwh$*A`rY^~Bkaq1gr2tpzO6$v6QKYZ;*|8=hn`V8kH3 z#ZGFQ*@9SAb-dH!;CQOn-uuSXc&l!+#=M!b%OmBWCX@Q{M2q7kMFcxMg~k~UGQHGm zk48pxK@q}STp>dYZrRzYE3xpVNekRVyZ%mm)xix`L^tcAtTxXK(~S*rg2U-@^Rvo}Q(4&LKTv(Qo&#`SiRlr@lEL9;h{2+RFpeK{{ zf+-o94{wwmQ_$|n3(TD^mg2Kp-1pSb5HMD2TL||Wk>y7N^lg`Jw&fUM9$^oT#J&t? 
zauC(|pik|44}<;>oS#!l<5{3}79wHlB$Wh@+#?l(qr#D=e3Wv(i(Dd}M>u;e5NjEf zLl~7!F(m;D{j7q+&G7P&Yc4rS$qfRgmLYRA_|b)@f)vr*^<4^K8YS`AsfSh`44ZCE z#AJIJZHn75PW&gXjoqXBkS#YE2Mt)*+niU)8TFueYQYWU`SL5;*rE4woV62#TohBW ztZ|7u=Sn7by_JVtpKRX}sN94?q21@S0SuAATyw^LccrSxmUA}O&AKw@-T(KSb_$+=dxq;)X&VSicw%DOpC-&ai79b<$KC<`Y~i_JSb7Lp@U{K z`4u{PcQ_M|k}D&ez$1b(7fWtiK#NHAwCjrR^4y25zkT)b&Cg0Fa@LCRkcuHV-t!WJ zzn&!R&JH2D6Msw*JCYQ!xq#yMtZU1^kwEg)1?5D#?pjS!(8V`%KN~xNpXr$CA;;=Y zn(@XV#)j$1Z?KwABgsSm!+$%L4BaRvDSvA=*7_6I56|;lmbN&@tuY`|@gnXi`r__e z-I|Mz=BP<)%<>X$$gSDk*)^Ed3RC!VLT>QH7 zl{Txyky0Z!7u&a?}vVEK(ccm`#0i+CAWUYDW%2`*TCo`aK54_#4;Wt*Ff*ud80(i?u?)At9hYau|`DmWzh5Rqq6QMgg@5eCYrFQzxh!vOkzG&Cp?x#@h~Y3aBl0(j zje(G-r#QJU!;i9io^<@)w7pE*W_l-sm{F*5g%A8RSE#222&Y2E(@b%MlMeIWNh)2T zkj(-(*uk>}15?KK+wQZm6c&fF;~%;^9;m|>eUWjtJ^;uFefk+E%m-A}R&Wrv+2ewV zi6VAYFkzxagzEU@;8k8eEi{zt!X(mL5)CRH<#)G}wnW8VJfBe}V5aql=eOZe41$4| zzbTzAzNPR!lr2}LF+6IS5yoz_X4Nv$*}pJ*3>{_N&|htuq@cN7*iZbF4HZ%PtRzmP z=+~HJMkj2JVB%(%0<+NKkn?7`j(iYQ{}312XId`dd7xJ6{HA}||Gj!8{;hF&%CdPT zVPy%e_S&reOOXd}q4R#Z6tmJ?f2fhitb~E9ttVatxpI68H0)y%q2eWRM5W@>gN0m; zDn3^wXW`@KQRV(Ey}~FdOi&3*8oFjsSio>csHp&PB9(>S9W<;fM4qR1Xv(I+e=RyzI`+^VH>jDqh^ZgIl<9!d(sj3c_VxVzZF?=MmDByn&IQ7P zt5Mw?%thSTj8WCdfigl7jcY1mqnjm>DpAbN%+-Bn7gLNFib2V+jC`lNE;?niRU{O# zh4BAg{h*$gOf zvy?H&frD7?m~XCFh%TGqqDUyIKovg^gV4eVFy^3Om^s?aV`KV5=!x(*RIrddsdkf* zzje&OGWKF+Vq=V5%ysDon}CBeb1_zX3eO9%ekm?8$C3+~XUX+08z5Ke{QVy@!58J( z;Jej43Vr0AQ>DSrZw@Xh-{#Os!50mvGUXkVof%4aOt84BBe{(%n<1~&PMyowjV(E{ z46o6+C)%`eH^8TfWLYVRwYf@l z!E+LI3TR35{?1J+s*Ik26<`-?h8!udE8jqg@5{6&%w6OKJDg}kR4m7_=*muu1=ZWJ z#AHWRgC|+^?r*JznG+JZ5y-p}?Pzpd1x3=)QpyKXX&(f3W3^*D#S&o>VdAK*2p(l_ z%2qM`JsNP&lSn5I|Ra`!xriZc{ z4cDkUelV5j??*@Vfm9X^;sWKgA_c-Z3xHJ>IjzlIvs(%u`_nM!Vu zGCrDI38KPRgQp2J+#)+*6nhX)`;n^$92?Kc11cnP8!PtL8Ut9VUU zSCNN-K^_9OQ(({^u2bqap;F|JNDD-RG0#q%@fue7Qa2)Vi%X#*&?G{u&Y#%uoKC-Y4uX#))SAFeg^NG$w;UksL4p=&BBe+ackC|G&>Qb zJAcU{8+3r}n5~qX!)QcXWsSxfZ6I`AVGaM7kE9u#)1eM%pOs;@e)8V2TQyMJE7aZF 
zE4*Gvz@8?pK7US{V&)_>P5qhLO3t^&7V%v>=4w)2`S6sDj{U}`_*j}YeI^~EB0cXF zz4BvjWGlW$2sT@;%bFdrI|Kn+t_q2LY94=V+6?E%X2kY@cx=C~%C^E?Le!>F5z12= z2uX=^@r2>tqRmY3G2b!PtOF6?M-@6-7V8pb|~-2fgQP1H+Olnnod0 z)!iXd<=;c1%Kt0<9ibS?_j|D$%~j6v_CRP#(;R+K_2LhHXr*fO@E@q^2BA<_p576j zf6b#3%npJ^{H>^Xao3teakM|$A4$qlSdTEME4zWvr(W+69{wPd-@EolL%;Qh@V~n_ z-robnA9Tx}!|#Eg!yg_cO9=L#zWodG+ba}mvIo?EdY^0IxsXJm)wqnmHR=B8eCgch zM``7p%afcnw$mJbeQ844{F_>!j64rS1p*#kif10a;Fvz)IHmcZkyDz5br_%4{j>A@ zOJI}~{{2kBqZ*{cI7(O4le)hH)u5KZMgyqWhTWoQBW-u-RTK5&N}?_n*BNw+a8g;C z{G20bWNS~Pk>?h}dH6yi$H`75h81QH-dXrqBH_tarK=@oCg>-28;z{%lOiz(A4&p8 zIFSh3BQi%giSK1f?7=mV}T35rvV+Czipk6xi(|zK5N0rZBzzR@T&Bdz#f~6cC}z3 zRafG%6LK=Xz9-wL2*hWVZSTPUCo33~UIiqwvy56a*a_a%tT_2NQ0r8hp_}&H?xL+hx!9f z6790I1M;K$q}2CgVK$(B8FKJ*RW|5TJ+_g_`6kPf@%dMNC*QW`7xt}KNwoFNe-j*9 zUq%x@{YR|*)~eOt#edrWS5G{Qh#<9rSfx~zK^@N{T8$oZb$fNB?}^7<>2`IvpiR>r z!RALUM+Xk8SXMn(WpH(hd1h_+8u3)ZOWC>>_#C#h<~$3&01QW8+gQq%Nel6tXA5{ABz_n|8BNvFd&3M?G=M@|~CXN;+fiX|` znXz{CynbV$(fR4e{vD;ZZykN1N9i}$^{MR*oFzv7c}ErNejgUy;$Gs?;>uFnO<{YJ zYsCkZY*G;7PG3VR-bHdR9>In{?So;4oxX7{eHmg4FUk5{IPLyG16D~ zi?23)R>z`;;RE2}uDe5ExV3Nug;H4$c2IjjcNHS^$9ynpb8JM}+3fjBiNn4AUF_T4 zE*;_3{>1VgGBD#WM}*AQbjvmffgt< z5_#J|Hw%7IT5YhpYJxHeB|nN;7fs=UbK#5qLXYik2OUQ4k(pasebc9bphnsE^x-j` zP(q?+toKgn8Gb-S()*SKYw;R5Gmwa6au$9+l$7~OIg$QG>Hsc_7*U$U#IsRmX`rN- zkb9&V=q)BCy|z*BvgdePcZo5#QqxzJdC^OXyVjuTv+5+Uaq z>of+GfphXxT$)Wb4DlJ*1V!~l(YQh_$#oF+0bka)&pgti8{Q3sEb_`FC%NcgK!aJ6 z56g$OO+M^YVeo7I3@v>>{B`?SnG>d@!hCpbY!)#5@>+6dOtp8AA)c!ro_}=Eo9E#Y z=v3v%miJXSXvL6psaI0aNDkI#m48v1d&sdu5<1QDa_NmcDn{lc4B{lkB&6ojKhdBl ztOVQLgk7bs)GC*uNy;>3sml#uuJD1WU~4!k*Td<$E8pubXxUqIkt#7DikOGx27101QYdTTzQYIk7-8 zFUg|~x{Vz=v@++|mZoyeSG{bX9+=6vCh<8M!p8~Uy10D_k%4_+JjwxU!}I~0LrE-6 zsrp?hes>VU(I7_G!9O3(+Ye|=IJ~RI`Fp%L;(MgXdw7Vfy^XB{y@8GW@0#%*HU7Ww zkawFE87n6fzGcrZK5UOrX|~I zNzirsx?Cc$WI0p<*cM9u-*T`&^np$gW;9XN6D-ube?iUBr~$_b*=9x4g(6Pe0{>p- z7y4CH7$Se=M=lsccn$z)1`7uak;jeJRBr;lZUmudt3lazVI?BqN%vm2yp*b#tYZEY 
zfBUX~R^q9QCrL^fMK+TuRn;11NWqgz%g8xPhtJA|i>0ytmhGhcY%|tlKs>Ot-mD!5_H? z)=t6494<+;lG}fNY;M(=ki!eP>_aH>X3-juvuBm+xyVzqVetVD-vxdn83+sb8p@q5 zVfrk1_0Ok$P-Vgc%kP@H{Jw~M_^YPs+1mbQ!v3eI?^^o)q$^0-U=x05S2^-gwTt>j zJ1SPrD7Rlcl4C{V7MD)RX^~gs>#J(#;}x>*8nK!=@}oUP&-Dsut=*~clYZ#RGHaSt zVL5-9X5Ojyc;?Zh3R`)&*orWgiQTv~#&X8}Jy1ip8?2Z?IYrPH3un>cD)y)&-;79P zdySuUbdF4ja2{tX%G7NxNA&_}_nx1$>^&t+u`Mf~lcV7Ixb~4{ z4GzCZQgrO6f!~|@u8^9UujeG|#CaK3CwN5agL?ulcK8LJ-GMxPt2u@U-cOjrR#!PL z{CmmIzbG@w>GZ2q_(xPq%^T1+jNJJp5%PhZ$hL!LNb5tlOWrdZ8;V7uN z!uBy-Z<>&hc-be;1%C^b>2SVtW83g>HKQqf&7IXdEEut1(x4jwqtL7-rvAt>ZAXy@ zgjXD>`xC@*@$4JN;PdGa^$pjvBBO?Mq2nX&r&Uwr0eTYeccuTAGZ+Y%_MJcd^G&>e z{rG>%f4G+?EAjUV{(e{JPbdJ8^*+b`a&zcc;IEe`{)BeEGmQVbQt>PN@6m~Wg5NI{ zzPq*mf3b?c*7R#&*`IZ>yz`uY3^V(+ieG~W{;VPbMHKgEI_^+pbf5H)Q{|*1! zS>UhuUw!(2;ynoeX#PJQ{$Ihr^6EdqEOh?{|H86=t>IT@_-74u?|k9+0sI?J{1yLq y`tT None: + ... + +def map(rating:str) -> int: + mapping = { + 'right' : 0, + 'left-center' : 1, + 'center' : 2, + 'left' : 3, + 'allsides' : 4, + 'right-center' : 5 + } + return mapping[rating] + + +@cli.command() +def load() -> None: + DB = connect() + DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR']) + f = str(DATA_DIR / "bias_ratings.csv") + + DB.sql(f""" + create table bias_ratings as + select + row_number() over(order by b.publisher) as id + ,b.* + from read_csv_auto('{f}') b + """) +@cli.command() +def join() -> None: + DB = connect() + + DB.sql(""" + with cte as ( + select + s.publisher + ,count(1) as stories + from stories s + group by s.publisher + ) + select + s.publisher + ,s.stories + ,b.publisher + ,b.bias + from bias_ratings b + join cte s + on s.publisher = b.publisher + order by + stories desc + limit 15 + """) + + DB.sql(""" + with cte as ( + select + s.publisher + ,count(1) as stories + from stories s + group by s.publisher + ) + select + sum(stories) + ,avg(agree / disagree) + from bias_ratings b + join cte s + on s.publisher = b.publisher + """) + + DB.sql(""" + with cte as ( + select + s.publisher + ,count(1) 
as stories + from stories s + group by s.publisher + ) + select + sum(s.stories) filter(where b.publisher is not null) as matched + ,sum(s.stories) filter(where b.publisher is null) as unmatched + ,cast(sum(s.stories) filter(where b.publisher is not null) as numeric) + / sum(s.stories) filter(where b.publisher is null) as precent_matched + from bias_ratings b + right join cte s + on s.publisher = b.publisher + """) + + DB.sql(""" + select + * + from bias_ratings + where publisher ilike '%CNN%' + """) + +@cli.command() +def debug() -> None: + DB = connect() + DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR']) + f = str(DATA_DIR / "bias_ratings.csv") + + DB.sql(""" + with cte as ( + select + outlet + ,count(1) as stories + from stories + group by outlet + ) + ,total as ( + select + sum(stories) as total + from cte + ) + select + cte.outlet + ,cte.stories + ,bias.outlet + ,bias.lean + ,sum(100 * (cte.stories / cast(total.total as float))) over() as rep + ,total.total + from cte + join bias + on jaro_winkler_similarity(bias.outlet, cte.outlet) > 0.9 + cross join total.total + """) + + DB.sql(""" + select + outlet + ,count(1) as stories + from stories + group by outlet + order by count(1) desc + limit 50 + """) + + outlets + +@cli.command() +def parse_html() -> None: + """parse the save html page of allslides.com bias ratings into a normalized csv file""" + DB = connect() + DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR']) + bias_html = DATA_DIR / 'allsides.html' + + parser = etree.HTMLParser() + tree = etree.parse(str(bias_html), parser) + root = tree.getroot() + rows = root.xpath('//table[contains(@class,"views-table")]/tbody/tr') + + ratings = [] + for row in rows: + rating = dict() + publisher = row.xpath('./td[contains(@class, "source-title")]/a')[0].text + rating['publisher'] = publisher + + bias = row.xpath('./td[contains(@class, "views-field-field-bias-image")]/a')[0].get('href') + bias = bias.split('/')[-1] + rating['bias'] = bias + + agree = 
row.xpath('.//span[contains(@class, "agree")]')[0].text + disagree = row.xpath('.//span[contains(@class, "disagree")]')[0].text + + rating['agree'] = int(agree) + rating['disagree'] = int(disagree) + ratings.append(rating) + df = pd.DataFrame(ratings) + df.to_csv(DATA_DIR / 'bias_ratings.csv', sep="|", index=False, quoting=csv.QUOTE_NONNUMERIC) + +if __name__ == "__main__": + cli() diff --git a/src/join_bias.py b/src/join_bias.py deleted file mode 100644 index a268f51..0000000 --- a/src/join_bias.py +++ /dev/null @@ -1,46 +0,0 @@ -import click -import duckdb -from data import connect -import polars as ps - -DB = connect() -DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR']) -bias = ps.read_csv(DATA_DIR / 'allsides_bias.csv', sep="|") - -DB.sql(""" - with cte as ( - select - outlet - ,count(1) as stories - from stories - group by outlet - ) - ,total as ( - select - sum(stories) as total - from cte - ) - select - cte.outlet - ,cte.stories - ,bias.outlet - ,bias.lean - ,sum(100 * (cte.stories / cast(total.total as float))) over() as rep - ,total.total - from cte - join bias - on jaro_winkler_similarity(bias.outlet, cte.outlet) > 0.9 - cross join total.total -""") - -DB.sql(""" - select - outlet - ,count(1) as stories - from stories - group by outlet - order by count(1) desc - limit 50 -""") - -outlets diff --git a/src/scrape.py b/src/scrape.py index 65d2c30..8950b25 100644 --- a/src/scrape.py +++ b/src/scrape.py @@ -59,27 +59,28 @@ def download(output_dir): @cli.command() -@click.option('-d', '--directory', type=Path, default=data_dir() / "memeorandum") -@click.option('-o', '--output_dir', type=Path, default=data_dir()) +@click.option('-d', '--directory', type=Path, default=data_dir() / "memeorandum", show_default=True) +@click.option('-o', '--output_dir', type=Path, default=data_dir(), show_default=True) def parse(directory, output_dir): + """parse the html files on disk into a structured csv format.""" directory = data_dir() / "memeorandum" parser = 
etree.HTMLParser() pages = [f for f in directory.glob("*.html")] published = [] others = [] - #page = pages[0] + # page = pages[0] page_iter = tqdm(pages, postfix="starting") for page in page_iter: page_iter.set_postfix_str(f"{page}") date = datetime.datetime.strptime(page.stem, '%y-%m-%d') - # tree = etree.parse(str(page), parser) tree = etree.parse(str(page), parser) root = tree.getroot() - if not root: + if root is None: print(f"error opening {page}") continue items = root.xpath("//div[contains(@class, 'item')]") + # item = items[0] for item in items: out = dict() citation = item.xpath('./cite') @@ -92,16 +93,24 @@ def parse(directory, output_dir): author = '' out['author'] = author try: - url = citation[0].getchildren()[0].get('href') + publisher_url = citation[0].getchildren()[0].get('href') publisher = citation[0].getchildren()[0].text except IndexError as e: print(f"error with citation url: {page}") out['publisher'] = publisher - out['publisher_url'] = url + out['publisher_url'] = publisher_url + title = item.xpath('.//strong/a')[0].text out['title'] = title - item_id = hash((title,page.stem,url)) + + url = item.xpath('.//strong/a')[0].get('href') + out['url'] = url + + item_id = hash((page.stem, url)) out['id'] = item_id + + old_id = hash((title, page.stem, publisher_url)) + out['old_id'] = old_id published.append(out) related = item.xpath(".//span[contains(@class, 'mls')]/a") @@ -113,9 +122,22 @@ def parse(directory, output_dir): another['parent_id'] = item_id others.append(another) df = pd.DataFrame(published) - df.to_csv(output_dir / 'stories.csv', sep='|', index=False) + df.to_csv(output_dir / 'stories_v2.csv', sep='|', index=False) df = pd.DataFrame(others) - df.to_csv(output_dir / 'related.csv', sep='|', index=False) + df.to_csv(output_dir / 'related_v2.csv', sep='|', index=False) + +@cli.command() +def normalize(): + DB = connect() + DB.sql(""" + create table publishers as + select + row_number() over(order by publisher) as id + ,publisher + 
,publisher_url + from stories + group by publisher, publisher_url + """) if __name__ == "__main__": diff --git a/src/word.py b/src/word.py index ec36f91..93ba245 100644 --- a/src/word.py +++ b/src/word.py @@ -3,28 +3,49 @@ from scipy.spatial import distance from transformers import AutoTokenizer, RobertaModel import numpy as np from model import Model -from data import Data, from_db +from data import Data, from_db, connect @click.group() def cli(): ... +@cli.command() +def max_sequence(): + db = connect() + longest = db.sql(""" + select + title + from stories + order by length(title) desc + limit 5000 + """).df() + + tokenizer = AutoTokenizer.from_pretrained("roberta-base") + tokens = tokenizer(longest['title'].to_list()) + print(f"{max([len(x) for x in tokens['input_ids']])}") + @cli.command() def train(): table = from_db(Data.Titles) + + n_classes = 10 tokenizer = AutoTokenizer.from_pretrained("roberta-base") model = RobertaModel.from_pretrained("roberta-base") def get_embeddings(titles): # create tokens, padding to max width - tokens = tokenizer(titles, add_special_tokens = True, truncation = True, padding = "max_length", return_attention_mask = True, return_tensors = "pt") + tokens = tokenizer(titles, add_special_tokens = True, truncation = True, padding = "max_length", max_length=70, return_attention_mask = True, return_tensors = "pt") outputs = model(**tokens) return outputs.last_hidden_state[:, 0, :] titles = table['title'].apply(str).to_list()[:10] get_embeddings(titles) + outputs.last_hidden_state[0][200:] + outputs.values().shape + model + # linear = torch.nn.Linear(model.config.hidden_size, n_classes) # act = torch.nn.Sigmoid()