finish paper
30
Makefile
|
@ -1,11 +1,35 @@
|
|||
.PHONY:to_wwu
|
||||
PREFIX=jensen_577
|
||||
.PHONY: wwu cedar paper
|
||||
PAPER=acm_template
|
||||
|
||||
all: to_wwu
|
||||
all: wwu
|
||||
|
||||
to_wwu:
|
||||
paper: dist/$(PREFIX)_paper.pdf
|
||||
|
||||
dist/$(PREFIX)_paper.pdf: docs/$(PAPER).pdf
|
||||
mv $^ $@
|
||||
|
||||
docs/$(PAPER).pdf: docs/Makefile docs/$(PAPER).tex
|
||||
make -C docs $(PAPER).pdf
|
||||
|
||||
hw: dist/$(PREFIX)_hw1.pdf
|
||||
|
||||
wwu:
|
||||
rsync -avz ~/577/repo/docs/figures/ linux-04:/home/jensen33/Dev/studentweb/assets/static/577/
|
||||
scp ~/577/repo/docs/presentation.md linux-04:/home/jensen33/Dev/studentweb/content/577/contents.lr
|
||||
scp ~/Dev/www.publicmatt.com/models/slides.ini linux-04:/home/jensen33/Dev/studentweb/models/
|
||||
scp ~/Dev/www.publicmatt.com/templates/slides.html linux-04:/home/jensen33/Dev/studentweb/templates/
|
||||
rsync -avz ~/Dev/www.publicmatt.com/assets/static/revealjs linux-04:/home/jensen33/Dev/studentweb/assets/static/
|
||||
ssh linux-04 cd /home/jensen33/Dev/studentweb \; make
|
||||
|
||||
cedar:
|
||||
scp ~/577/repo/docs/presentation.md cedar:/home/user/www.publicmatt.com/content/577/contents.lr
|
||||
scp ~/Dev/www.publicmatt.com/templates/slides.html cedar:/home/user/www.publicmatt.com/templates/
|
||||
rsync -avz ~/Dev/www.publicmatt.com/assets/static/revealjs cedar:/home/user/www.publicmatt.com/assets/static/
|
||||
ssh cedar cd www.publicmatt.com \; make
|
||||
|
||||
dist/$(PREFIX)_hw1.pdf: docs/hw1.md
|
||||
pandoc $^ -o $@
|
||||
|
||||
clean:
|
||||
rm dist/$(PREFIX)_paper.pdf
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
To Dr. Hearne, for the instruction on clustering and classification techniques, and to Pax Newman for the discussion on word embeddings.
|
|
@ -1,144 +1,204 @@
|
|||
|
||||
@article{stewart_polarization_2020,
|
||||
title = {Polarization under rising inequality and economic decline},
|
||||
volume = {6},
|
||||
issn = {2375-2548},
|
||||
url = {https://www.science.org/doi/10.1126/sciadv.abd4201},
|
||||
doi = {10.1126/sciadv.abd4201},
|
||||
abstract = {Polarization can spread and become entrenched when inequality creates subpopulations that cannot afford risks.
|
||||
,
|
||||
Social and political polarization is an important source of conflict in many societies. Understanding its causes has become a priority of scholars across disciplines. We demonstrate that shifts in socialization strategies analogous to political polarization can arise as a locally beneficial response to both rising wealth inequality and economic decline. In many contexts, interaction with diverse out-groups confers benefits from innovation and exploration greater than those that arise from interacting exclusively with a homogeneous in-group. However, when the economic environment favors risk aversion, a strategy of seeking lower-risk in-group interactions can be important to maintaining individual solvency. Our model shows that under conditions of economic decline or increasing inequality, some members of the population benefit from adopting a risk-averse, in-group favoring strategy. Moreover, we show that such in-group polarization can spread rapidly to the whole population and persist even when the conditions that produced it have reversed.},
|
||||
language = {en},
|
||||
number = {50},
|
||||
urldate = {2023-05-16},
|
||||
journal = {Science Advances},
|
||||
author = {Stewart, Alexander J. and McCarty, Nolan and Bryson, Joanna J.},
|
||||
month = dec,
|
||||
year = {2020},
|
||||
pages = {eabd4201},
|
||||
file = {Stewart et al. - 2020 - Polarization under rising inequality and economic .pdf:/home/user/Zotero/storage/ZJXIIIBC/Stewart et al. - 2020 - Polarization under rising inequality and economic .pdf:application/pdf},
|
||||
@article{allcottPolarizationPublicHealth2020,
|
||||
title = {Polarization and Public Health: {{Partisan}} Differences in Social Distancing during the Coronavirus Pandemic},
|
||||
shorttitle = {Polarization and Public Health},
|
||||
author = {Allcott, Hunt and Boxell, Levi and Conway, Jacob and Gentzkow, Matthew and Thaler, Michael and Yang, David},
|
||||
year = {2020},
|
||||
month = nov,
|
||||
journal = {Journal of Public Economics},
|
||||
volume = {191},
|
||||
pages = {104254},
|
||||
issn = {00472727},
|
||||
doi = {10.1016/j.jpubeco.2020.104254},
|
||||
urldate = {2023-06-06},
|
||||
abstract = {We study partisan differences in Americans' response to the COVID-19 pandemic. Political leaders and media outlets on the right and left have sent divergent messages about the severity of the crisis, which could impact the extent to which Republicans and Democrats engage in social distancing and other efforts to reduce disease transmission. We develop a simple model of a pandemic response with heterogeneous agents that clarifies the causes and consequences of heterogeneous responses. We use location data from a large sample of smartphones to show that areas with more Republicans engaged in less social distancing, controlling for other factors including public policies, population density, and local COVID cases and deaths. We then present new survey evidence of significant gaps at the individual level between Republicans and Democrats in self-reported social distancing, beliefs about personal COVID risk, and beliefs about the future severity of the pandemic.},
|
||||
langid = {english},
|
||||
file = {/home/user/577/repo/docs/references/1-s2.0-S0047272720301183-main.pdf}
|
||||
}
|
||||
|
||||
@article{prior_media_2013,
|
||||
title = {Media and {Political} {Polarization}},
|
||||
volume = {16},
|
||||
issn = {1094-2939, 1545-1577},
|
||||
url = {https://www.annualreviews.org/doi/10.1146/annurev-polisci-100711-135242},
|
||||
doi = {10.1146/annurev-polisci-100711-135242},
|
||||
abstract = {This article examines if the emergence of more partisan media has contributed to political polarization and led Americans to support more partisan policies and candidates. Congress and some newer media outlets have added more partisan messages to a continuing supply of mostly centrist news. Although political attitudes of most Americans have remained fairly moderate, evidence points to some polarization among the politically involved. Proliferation of media choices lowered the share of less interested, less partisan voters and thereby made elections more partisan. But evidence for a causal link between more partisan messages and changing attitudes or behaviors is mixed at best. Measurement problems hold back research on partisan selective exposure and its consequences. Ideologically one-sided news exposure may be largely confined to a small, but highly involved and influential, segment of the population. There is no firm evidence that partisan media are making ordinary Americans more partisan.},
|
||||
language = {en},
|
||||
number = {1},
|
||||
urldate = {2023-06-06},
|
||||
journal = {Annual Review of Political Science},
|
||||
author = {Prior, Markus},
|
||||
month = may,
|
||||
year = {2013},
|
||||
pages = {101--127},
|
||||
file = {Prior - 2013 - Media and Political Polarization.pdf:/home/user/Zotero/storage/SFKISRT9/Prior - 2013 - Media and Political Polarization.pdf:application/pdf},
|
||||
@article{allcottSocialMediaFake2017,
|
||||
title = {Social {{Media}} and {{Fake News}} in the 2016 {{Election}}},
|
||||
author = {Allcott, Hunt and Gentzkow, Matthew},
|
||||
year = {2017},
|
||||
month = may,
|
||||
journal = {Journal of Economic Perspectives},
|
||||
volume = {31},
|
||||
number = {2},
|
||||
pages = {211--236},
|
||||
issn = {0895-3309},
|
||||
doi = {10.1257/jep.31.2.211},
|
||||
urldate = {2023-06-06},
|
||||
abstract = {Following the 2016 US presidential election, many have expressed concern about the effects of false stories (``fake news''), circulated largely through social media. We discuss the economics of fake news and present new data on its consumption prior to the election. Drawing on web browsing data, archives of fact-checking websites, and results from a new online survey, we find: 1) social media was an important but not dominant source of election news, with 14 percent of Americans calling social media their ``most important'' source; 2) of the known false news stories that appeared in the three months before the election, those favoring Trump were shared a total of 30 million times on Facebook, while those favoring Clinton were shared 8 million times; 3) the average American adult saw on the order of one or perhaps several fake news stories in the months around the election, with just over half of those who recalled seeing them believing them; and 4) people are much more likely to believe stories that favor their preferred candidate, especially if they have ideologically segregated social media networks.},
|
||||
langid = {english},
|
||||
file = {/home/user/577/repo/docs/references/jep.31.2.211.pdf}
|
||||
}
|
||||
|
||||
@article{allcott_social_2017,
|
||||
title = {Social {Media} and {Fake} {News} in the 2016 {Election}},
|
||||
volume = {31},
|
||||
issn = {0895-3309},
|
||||
url = {https://pubs.aeaweb.org/doi/10.1257/jep.31.2.211},
|
||||
doi = {10.1257/jep.31.2.211},
|
||||
abstract = {Following the 2016 US presidential election, many have expressed concern about the effects of false stories (“fake news”), circulated largely through social media. We discuss the economics of fake news and present new data on its consumption prior to the election. Drawing on web browsing data, archives of fact-checking websites, and results from a new online survey, we find: 1) social media was an important but not dominant source of election news, with 14 percent of Americans calling social media their “most important” source; 2) of the known false news stories that appeared in the three months before the election, those favoring Trump were shared a total of 30 million times on Facebook, while those favoring Clinton were shared 8 million times; 3) the average American adult saw on the order of one or perhaps several fake news stories in the months around the election, with just over half of those who recalled seeing them believing them; and 4) people are much more likely to believe stories that favor their preferred candidate, especially if they have ideologically segregated social media networks.},
|
||||
language = {en},
|
||||
number = {2},
|
||||
urldate = {2023-06-06},
|
||||
journal = {Journal of Economic Perspectives},
|
||||
author = {Allcott, Hunt and Gentzkow, Matthew},
|
||||
month = may,
|
||||
year = {2017},
|
||||
pages = {211--236},
|
||||
file = {jep.31.2.211.pdf:/home/user/577/repo/docs/references/jep.31.2.211.pdf:application/pdf},
|
||||
@article{autorImportingPoliticalPolarization2020,
|
||||
title = {Importing {{Political Polarization}}? {{The Electoral Consequences}} of {{Rising Trade Exposure}}},
|
||||
shorttitle = {Importing {{Political Polarization}}?},
|
||||
author = {Autor, David and Dorn, David and Hanson, Gordon and Majlesi, Kaveh},
|
||||
year = {2020},
|
||||
month = oct,
|
||||
journal = {American Economic Review},
|
||||
volume = {110},
|
||||
number = {10},
|
||||
pages = {3139--3183},
|
||||
issn = {0002-8282},
|
||||
doi = {10.1257/aer.20170011},
|
||||
urldate = {2023-06-06},
|
||||
abstract = {Has rising import competition contributed to the polarization of US politics? Analyzing multiple measures of political expression and results of congressional and presidential elections spanning the period 2000 through 2016, we find strong though not definitive evidence of an ideological realignment in trade-exposed local labor markets that commences prior to the divisive 2016 US presidential election. Exploiting the exogenous component of rising import competition by China, we find that trade exposed electoral districts simultaneously exhibit growing ideological polarization in some domains, meaning expanding support for both strong-left and strong-right views, and pure rightward shifts in others. Specifically, trade-impacted commuting zones or districts saw an increasing market share for the Fox News channel (a rightward shift), stronger ideological polarization in campaign contributions (a polarized shift), and a relative rise in the likelihood of electing a Republican to Congress (a rightward shift). Trade-exposed counties with an initial majority White population became more likely to elect a GOP conservative, while trade-exposed counties with an initial majority-minority population became more likely to elect a liberal Democrat, where in both sets of counties, these gains came at the expense of moderate Democrats (a polarized shift). In presidential elections, counties with greater trade exposure shifted toward the Republican candidate (a rightward shift). These results broadly support an emerging political economy literature that connects adverse economic shocks to sharp ideological realignments that cleave along racial and ethnic lines and induce discrete shifts in political preferences and economic policy. (JEL D72, F14, J15, L82, R23)},
|
||||
langid = {english},
|
||||
file = {/home/user/577/repo/docs/references/w22637.pdf}
|
||||
}
|
||||
|
||||
@article{allcott_polarization_2020,
|
||||
title = {Polarization and public health: {Partisan} differences in social distancing during the coronavirus pandemic},
|
||||
volume = {191},
|
||||
issn = {00472727},
|
||||
shorttitle = {Polarization and public health},
|
||||
url = {https://linkinghub.elsevier.com/retrieve/pii/S0047272720301183},
|
||||
doi = {10.1016/j.jpubeco.2020.104254},
|
||||
abstract = {We study partisan differences in Americans' response to the COVID-19 pandemic. Political leaders and media outlets on the right and left have sent divergent messages about the severity of the crisis, which could impact the extent to which Republicans and Democrats engage in social distancing and other efforts to reduce disease transmission. We develop a simple model of a pandemic response with heterogeneous agents that clarifies the causes and consequences of heterogeneous responses. We use location data from a large sample of smartphones to show that areas with more Republicans engaged in less social distancing, controlling for other factors including public policies, population density, and local COVID cases and deaths. We then present new survey evidence of significant gaps at the individual level between Republicans and Democrats in self-reported social distancing, beliefs about personal COVID risk, and beliefs about the future severity of the pandemic.},
|
||||
language = {en},
|
||||
urldate = {2023-06-06},
|
||||
journal = {Journal of Public Economics},
|
||||
author = {Allcott, Hunt and Boxell, Levi and Conway, Jacob and Gentzkow, Matthew and Thaler, Michael and Yang, David},
|
||||
month = nov,
|
||||
year = {2020},
|
||||
pages = {104254},
|
||||
file = {1-s2.0-S0047272720301183-main.pdf:/home/user/577/repo/docs/references/1-s2.0-S0047272720301183-main.pdf:application/pdf},
|
||||
@misc{demszkyGoEmotionsDatasetFineGrained2020,
|
||||
title = {{{GoEmotions}}: {{A Dataset}} of {{Fine-Grained Emotions}}},
|
||||
shorttitle = {{{GoEmotions}}},
|
||||
author = {Demszky, Dorottya and {Movshovitz-Attias}, Dana and Ko, Jeongwoo and Cowen, Alan and Nemade, Gaurav and Ravi, Sujith},
|
||||
year = {2020},
|
||||
month = jun,
|
||||
number = {arXiv:2005.00547},
|
||||
eprint = {2005.00547},
|
||||
primaryclass = {cs},
|
||||
publisher = {{arXiv}},
|
||||
urldate = {2023-06-09},
|
||||
abstract = {Understanding emotion expressed in language has a wide range of applications, from building empathetic chatbots to detecting harmful online behavior. Advancement in this area can be improved using large-scale datasets with a fine-grained typology, adaptable to multiple downstream tasks. We introduce GoEmotions, the largest manually annotated dataset of 58k English Reddit comments, labeled for 27 emotion categories or Neutral. We demonstrate the high quality of the annotations via Principal Preserved Component Analysis. We conduct transfer learning experiments with existing emotion benchmarks to show that our dataset generalizes well to other domains and different emotion taxonomies. Our BERT-based model achieves an average F1-score of .46 across our proposed taxonomy, leaving much room for improvement.},
|
||||
archiveprefix = {arxiv},
|
||||
langid = {english},
|
||||
keywords = {Computer Science - Computation and Language},
|
||||
file = {/home/user/577/repo/docs/references/2005.00547.pdf}
|
||||
}
|
||||
|
||||
@article{flaxman_filter_2016,
|
||||
title = {Filter {Bubbles}, {Echo} {Chambers}, and {Online} {News} {Consumption}},
|
||||
volume = {80},
|
||||
issn = {0033-362X, 1537-5331},
|
||||
url = {https://academic.oup.com/poq/article-lookup/doi/10.1093/poq/nfw006},
|
||||
doi = {10.1093/poq/nfw006},
|
||||
abstract = {Online publishing, social networks, and web search have dramatically lowered the costs of producing, distributing, and discovering news articles. Some scholars argue that such technological changes increase exposure to diverse perspectives, while others worry that they increase ideological segregation. We address the issue by examining webbrowsing histories for 50,000 US-located users who regularly read online news. We find that social networks and search engines are associated with an increase in the mean ideological distance between individuals. However, somewhat counterintuitively, these same channels also are associated with an increase in an individual’s exposure to material from his or her less preferred side of the political spectrum. Finally, the vast majority of online news consumption is accounted for by individuals simply visiting the home pages of their favorite, typically mainstream, news outlets, tempering the consequences—both positive and negative—of recent technological changes. We thus uncover evidence for both sides of the debate, while also finding that the magnitude of the effects is relatively modest.},
|
||||
language = {en},
|
||||
number = {S1},
|
||||
urldate = {2023-06-06},
|
||||
journal = {Public Opinion Quarterly},
|
||||
author = {Flaxman, Seth and Goel, Sharad and Rao, Justin M.},
|
||||
year = {2016},
|
||||
pages = {298--320},
|
||||
file = {bubbles.pdf:/home/user/577/repo/docs/references/bubbles.pdf:application/pdf},
|
||||
@article{duboisEchoChamberOverstated2018,
|
||||
title = {The Echo Chamber Is Overstated: The Moderating Effect of Political Interest and Diverse Media},
|
||||
shorttitle = {The Echo Chamber Is Overstated},
|
||||
author = {Dubois, Elizabeth and Blank, Grant},
|
||||
year = {2018},
|
||||
month = may,
|
||||
journal = {Information, Communication \& Society},
|
||||
volume = {21},
|
||||
number = {5},
|
||||
pages = {729--745},
|
||||
issn = {1369-118X, 1468-4462},
|
||||
doi = {10.1080/1369118X.2018.1428656},
|
||||
urldate = {2023-06-09},
|
||||
abstract = {In a high-choice media environment, there are fears that individuals will select media and content that reinforce their existing beliefs and lead to segregation based on interest and/or partisanship. This could lead to partisan echo chambers among those who are politically interested and could contribute to a growing gap in knowledge between those who are politically interested and those who are not. However, the high-choice environment also allows individuals, including those who are politically interested, to consume a wide variety of media, which could lead them to more diverse content and perspectives. This study examines the relationship between political interest as well as media diversity and being caught in an echo chamber (measured by five different variables). Using a nationally representative survey of adult internet users in the United Kingdom (N = 2000), we find that those who are interested in politics and those with diverse media diets tend to avoid echo chambers. This work challenges the impact of echo chambers and tempers fears of partisan segregation since only a small segment of the population are likely to find themselves in an echo chamber. We argue that single media studies and studies which use narrow definitions and measurements of being in an echo chamber are flawed because they do not test the theory in the realistic context of a multiple media environment.},
|
||||
langid = {english},
|
||||
file = {/home/user/577/repo/docs/references/The echo chamber is overstated the moderating effect of political interest and diverse media.pdf}
|
||||
}
|
||||
|
||||
@article{guess_almost_2021,
|
||||
title = {({Almost}) {Everything} in {Moderation}: {New} {Evidence} on {Americans}' {Online} {Media} {Diets}},
|
||||
volume = {65},
|
||||
issn = {0092-5853, 1540-5907},
|
||||
shorttitle = {({Almost}) {Everything} in {Moderation}},
|
||||
url = {https://onlinelibrary.wiley.com/doi/10.1111/ajps.12589},
|
||||
doi = {10.1111/ajps.12589},
|
||||
abstract = {Does the internet facilitate selective exposure to politically congenial content? To answer this question, I introduce and validate large-N behavioral data on Americans’ online media consumption in both 2015 and 2016. I then construct a simple measure of media diet slant and use machine classification to identify individual articles related to news about politics. I find that most people across the political spectrum have relatively moderate media diets, about a quarter of which consist of mainstream news websites and portals. Quantifying the similarity of Democrats’ and Republicans’ media diets, I find nearly 65\% overlap in the two groups’ distributions in 2015 and roughly 50\% in 2016. An exception to this picture is a small group of partisans who drive a disproportionate amount of traffic to ideologically slanted websites. If online “echo chambers” exist, they are a reality for relatively few people who may nonetheless exert disproportionate influence and visibility.},
|
||||
language = {en},
|
||||
number = {4},
|
||||
urldate = {2023-06-06},
|
||||
journal = {American Journal of Political Science},
|
||||
author = {Guess, Andrew M.},
|
||||
month = oct,
|
||||
year = {2021},
|
||||
pages = {1007--1022},
|
||||
file = {guess2021.pdf:/home/user/577/repo/docs/references/guess2021.pdf:application/pdf},
|
||||
@article{flaxmanFilterBubblesEcho2016,
|
||||
title = {Filter {{Bubbles}}, {{Echo Chambers}}, and {{Online News Consumption}}},
|
||||
author = {Flaxman, Seth and Goel, Sharad and Rao, Justin M.},
|
||||
year = {2016},
|
||||
journal = {Public Opinion Quarterly},
|
||||
volume = {80},
|
||||
number = {S1},
|
||||
pages = {298--320},
|
||||
issn = {0033-362X, 1537-5331},
|
||||
doi = {10.1093/poq/nfw006},
|
||||
urldate = {2023-06-06},
|
||||
abstract = {Online publishing, social networks, and web search have dramatically lowered the costs of producing, distributing, and discovering news articles. Some scholars argue that such technological changes increase exposure to diverse perspectives, while others worry that they increase ideological segregation. We address the issue by examining webbrowsing histories for 50,000 US-located users who regularly read online news. We find that social networks and search engines are associated with an increase in the mean ideological distance between individuals. However, somewhat counterintuitively, these same channels also are associated with an increase in an individual's exposure to material from his or her less preferred side of the political spectrum. Finally, the vast majority of online news consumption is accounted for by individuals simply visiting the home pages of their favorite, typically mainstream, news outlets, tempering the consequences\textemdash both positive and negative\textemdash of recent technological changes. We thus uncover evidence for both sides of the debate, while also finding that the magnitude of the effects is relatively modest.},
|
||||
langid = {english},
|
||||
file = {/home/user/577/repo/docs/references/bubbles.pdf}
|
||||
}
|
||||
|
||||
@article{autor_importing_2020,
|
||||
title = {Importing {Political} {Polarization}? {The} {Electoral} {Consequences} of {Rising} {Trade} {Exposure}},
|
||||
volume = {110},
|
||||
issn = {0002-8282},
|
||||
shorttitle = {Importing {Political} {Polarization}?},
|
||||
url = {https://pubs.aeaweb.org/doi/10.1257/aer.20170011},
|
||||
doi = {10.1257/aer.20170011},
|
||||
abstract = {Has rising import competition contributed to the polarization of US politics? Analyzing multiple measures of political expression and results of congressional and presidential elections spanning the period 2000 through 2016, we find strong though not definitive evidence of an ideological realignment in trade-exposed local labor markets that commences prior to the divisive 2016 US presidential election. Exploiting the exogenous component of rising import competition by China, we find that trade exposed electoral districts simultaneously exhibit growing ideological polarization in some domains, meaning expanding support for both strong-left and strong-right views, and pure rightward shifts in others. Specifically, trade-impacted commuting zones or districts saw an increasing market share for the Fox News channel (a rightward shift), stronger ideological polarization in campaign contributions (a polarized shift), and a relative rise in the likelihood of electing a Republican to Congress (a rightward shift). Trade-exposed counties with an initial majority White population became more likely to elect a GOP conservative, while trade-exposed counties with an initial majority-minority population became more likely to elect a liberal Democrat, where in both sets of counties, these gains came at the expense of moderate Democrats (a polarized shift). In presidential elections, counties with greater trade exposure shifted toward the Republican candidate (a rightward shift). These results broadly support an emerging political economy literature that connects adverse economic shocks to sharp ideological realignments that cleave along racial and ethnic lines and induce discrete shifts in political preferences and economic policy. (JEL D72, F14, J15, L82, R23)},
|
||||
language = {en},
|
||||
number = {10},
|
||||
urldate = {2023-06-06},
|
||||
journal = {American Economic Review},
|
||||
author = {Autor, David and Dorn, David and Hanson, Gordon and Majlesi, Kaveh},
|
||||
month = oct,
|
||||
year = {2020},
|
||||
pages = {3139--3183},
|
||||
file = {w22637.pdf:/home/user/577/repo/docs/references/w22637.pdf:application/pdf},
|
||||
@article{grahamMoralStereotypesLiberals2012,
|
||||
title = {The {{Moral Stereotypes}} of {{Liberals}} and {{Conservatives}}: {{Exaggeration}} of {{Differences}} across the {{Political Spectrum}}},
|
||||
shorttitle = {The {{Moral Stereotypes}} of {{Liberals}} and {{Conservatives}}},
|
||||
author = {Graham, Jesse and Nosek, Brian A. and Haidt, Jonathan},
|
||||
editor = {Young, Liane},
|
||||
year = {2012},
|
||||
month = dec,
|
||||
journal = {PLoS ONE},
|
||||
volume = {7},
|
||||
number = {12},
|
||||
pages = {e50092},
|
||||
issn = {1932-6203},
|
||||
doi = {10.1371/journal.pone.0050092},
|
||||
urldate = {2023-06-09},
|
||||
abstract = {We investigated the moral stereotypes political liberals and conservatives have of themselves and each other. In reality, liberals endorse the individual-focused moral concerns of compassion and fairness more than conservatives do, and conservatives endorse the group-focused moral concerns of ingroup loyalty, respect for authorities and traditions, and physical/spiritual purity more than liberals do. 2,212 U.S. participants filled out the Moral Foundations Questionnaire with their own answers, or as a typical liberal or conservative would answer. Across the political spectrum, moral stereotypes about ``typical'' liberals and conservatives correctly reflected the direction of actual differences in foundation endorsement but exaggerated the magnitude of these differences. Contrary to common theories of stereotyping, the moral stereotypes were not simple underestimations of the political outgroup's morality. Both liberals and conservatives exaggerated the ideological extremity of moral concerns for the ingroup as well as the outgroup. Liberals were least accurate about both groups.},
|
||||
langid = {english},
|
||||
file = {/home/user/577/repo/docs/references/graham2012.pdf}
|
||||
}
|
||||
|
||||
@misc{jensen_data_2023,
|
||||
title = {Data {Mining} 577: {Political} {Polarization} {Source} {Code}},
|
||||
url = {https://github.com/publicmatt/data_mining_577},
|
||||
publisher = {https://github.com/publicmatt/data\_mining\_577},
|
||||
author = {Jensen, Matt},
|
||||
year = {2023},
|
||||
@article{guessAlmostEverythingModeration2021,
|
||||
title = {({{Almost}}) {{Everything}} in {{Moderation}}: {{New Evidence}} on {{Americans}}' {{Online Media Diets}}},
|
||||
shorttitle = {({{Almost}}) {{Everything}} in {{Moderation}}},
|
||||
author = {Guess, Andrew M.},
|
||||
year = {2021},
|
||||
month = oct,
|
||||
journal = {American Journal of Political Science},
|
||||
volume = {65},
|
||||
number = {4},
|
||||
pages = {1007--1022},
|
||||
issn = {0092-5853, 1540-5907},
|
||||
doi = {10.1111/ajps.12589},
|
||||
urldate = {2023-06-06},
|
||||
abstract = {Does the internet facilitate selective exposure to politically congenial content? To answer this question, I introduce and validate large-N behavioral data on Americans' online media consumption in both 2015 and 2016. I then construct a simple measure of media diet slant and use machine classification to identify individual articles related to news about politics. I find that most people across the political spectrum have relatively moderate media diets, about a quarter of which consist of mainstream news websites and portals. Quantifying the similarity of Democrats' and Republicans' media diets, I find nearly 65\% overlap in the two groups' distributions in 2015 and roughly 50\% in 2016. An exception to this picture is a small group of partisans who drive a disproportionate amount of traffic to ideologically slanted websites. If online ``echo chambers'' exist, they are a reality for relatively few people who may nonetheless exert disproportionate influence and visibility.},
|
||||
langid = {english},
|
||||
file = {/home/user/577/repo/docs/references/guess2021.pdf}
|
||||
}
|
||||
|
||||
@misc{jensen_data_2023-1,
|
||||
title = {Data {Mining} 577: {Political} {Polarization} {Data}},
|
||||
url = {https://data.publicmatt.com/national_news/stories},
|
||||
author = {Jensen, Matt},
|
||||
year = {2023},
|
||||
@misc{jensenDataMining5772023,
|
||||
title = {Data {{Mining}} 577: {{Political Polarization Source Code}}},
|
||||
author = {Jensen, Matt},
|
||||
year = {2023},
|
||||
publisher = {{https://github.com/publicmatt/data\_mining\_577}}
|
||||
}
|
||||
|
||||
@misc{jensenDataMining5772023a,
|
||||
title = {Data {{Mining}} 577: {{Political Polarization Data}}},
|
||||
author = {Jensen, Matt},
|
||||
year = {2023}
|
||||
}
|
||||
|
||||
@article{priorMediaPoliticalPolarization2013,
|
||||
title = {Media and {{Political Polarization}}},
|
||||
author = {Prior, Markus},
|
||||
year = {2013},
|
||||
month = may,
|
||||
journal = {Annual Review of Political Science},
|
||||
volume = {16},
|
||||
number = {1},
|
||||
pages = {101--127},
|
||||
issn = {1094-2939, 1545-1577},
|
||||
doi = {10.1146/annurev-polisci-100711-135242},
|
||||
urldate = {2023-06-06},
|
||||
abstract = {This article examines if the emergence of more partisan media has contributed to political polarization and led Americans to support more partisan policies and candidates. Congress and some newer media outlets have added more partisan messages to a continuing supply of mostly centrist news. Although political attitudes of most Americans have remained fairly moderate, evidence points to some polarization among the politically involved. Proliferation of media choices lowered the share of less interested, less partisan voters and thereby made elections more partisan. But evidence for a causal link between more partisan messages and changing attitudes or behaviors is mixed at best. Measurement problems hold back research on partisan selective exposure and its consequences. Ideologically one-sided news exposure may be largely confined to a small, but highly involved and influential, segment of the population. There is no firm evidence that partisan media are making ordinary Americans more partisan.},
|
||||
langid = {english},
|
||||
file = {/home/user/Zotero/storage/SFKISRT9/Prior - 2013 - Media and Political Polarization.pdf}
|
||||
}
|
||||
|
||||
@article{stewartPolarizationRisingInequality2020,
|
||||
title = {Polarization under Rising Inequality and Economic Decline},
|
||||
author = {Stewart, Alexander J. and McCarty, Nolan and Bryson, Joanna J.},
|
||||
year = {2020},
|
||||
month = dec,
|
||||
journal = {Science Advances},
|
||||
volume = {6},
|
||||
number = {50},
|
||||
pages = {eabd4201},
|
||||
issn = {2375-2548},
|
||||
doi = {10.1126/sciadv.abd4201},
|
||||
urldate = {2023-05-16},
|
||||
abstract = {Polarization can spread and become entrenched when inequality creates subpopulations that cannot afford risks. , Social and political polarization is an important source of conflict in many societies. Understanding its causes has become a priority of scholars across disciplines. We demonstrate that shifts in socialization strategies analogous to political polarization can arise as a locally beneficial response to both rising wealth inequality and economic decline. In many contexts, interaction with diverse out-groups confers benefits from innovation and exploration greater than those that arise from interacting exclusively with a homogeneous in-group. However, when the economic environment favors risk aversion, a strategy of seeking lower-risk in-group interactions can be important to maintaining individual solvency. Our model shows that under conditions of economic decline or increasing inequality, some members of the population benefit from adopting a risk-averse, in-group favoring strategy. Moreover, we show that such in-group polarization can spread rapidly to the whole population and persist even when the conditions that produced it have reversed.},
|
||||
langid = {english},
|
||||
file = {/home/user/Zotero/storage/ZJXIIIBC/Stewart et al. - 2020 - Polarization under rising inequality and economic .pdf}
|
||||
}
|
||||
|
||||
@inproceedings{zhuMultidimensionalPoliticalSpectrum2009,
|
||||
title = {Multidimensional Political Spectrum Identification and Analysis},
|
||||
booktitle = {Proceedings of the 18th {{ACM}} Conference on {{Information}} and Knowledge Management},
|
||||
author = {Zhu, Leilei and Mitra, Prasenjit},
|
||||
year = {2009},
|
||||
month = nov,
|
||||
pages = {2045--2048},
|
||||
publisher = {{ACM}},
|
||||
address = {{Hong Kong China}},
|
||||
doi = {10.1145/1645953.1646297},
|
||||
urldate = {2023-06-09},
|
||||
abstract = {In this work, we show the importance of multidimensional opinion representation in the political context combining domain knowledge and results from principal component analysis. We discuss the differences of feature selection between political spectrum analysis and normal opinion mining tasks. We build regression models on each opinion dimension for scoring and placing new opinion entities, e.g. personal blogs or politicians, onto the political opinion spectrum. We apply our methods on the floor statement records of the United States Senate and evaluate it against the uni-dimensional representation of political opinion space. The experimental results show the effectiveness of the proposed model in explaining the voting records of the Senate.},
|
||||
isbn = {978-1-60558-512-3},
|
||||
langid = {english},
|
||||
file = {/home/user/577/repo/docs/references/zhu2009.pdf}
|
||||
}
|
||||
|
|
After Width: | Height: | Size: 39 KiB |
Before Width: | Height: | Size: 33 KiB After Width: | Height: | Size: 24 KiB |
Before Width: | Height: | Size: 30 KiB After Width: | Height: | Size: 34 KiB |
Before Width: | Height: | Size: 29 KiB After Width: | Height: | Size: 28 KiB |
Before Width: | Height: | Size: 48 KiB After Width: | Height: | Size: 48 KiB |
Before Width: | Height: | Size: 37 KiB After Width: | Height: | Size: 43 KiB |
|
@ -0,0 +1,3 @@
|
|||
# Time from election vs. Neutrality frequency.
|
||||
|
||||
# Cluster based on average of embeddings.
|
344
docs/paper.tex
|
@ -19,15 +19,14 @@
|
|||
\renewcommand{\shortauthors}{Jensen, et al.}
|
||||
|
||||
\begin{abstract}
|
||||
Political polarization in the United States has increased in recent years according to studies \cite{stewart_polarization_2020}.
|
||||
A number of polling methods and data sources have been used to track this phenomenon \cite{prior_media_2013}.
|
||||
A causal link between polarization and partisanship in elections and the community has been hard to establish.
|
||||
One possible cause is the media diet of the average American.
|
||||
In particular, the medium of consumption has shifted online and the range of sources has widened considerably.
|
||||
In an effort to quantify the range of online media, a study of online news article headlines was conducted.
|
||||
It found that titles with emotionally neutral wording have decreased in the share of all articles over time.
|
||||
A model was built to classify titles using BERT-style word embeddings and a simple classifier.
|
||||
|
||||
Political polarization in the United States has increased in recent years according to studies \cite{stewartPolarizationRisingInequality2020}.
|
||||
A number of polling methods and data sources have been used to track this phenomenon \cite{priorMediaPoliticalPolarization2013}.
|
||||
A causal link between polarization and partisanship in elections and the community has been hard to establish.
|
||||
One possible cause is the media diet of the average American.
|
||||
In particular, the medium of consumption has shifted online and the range of sources has widened considerably.
|
||||
In an effort to quantify the range of online media, a study of online news article headlines was conducted.
|
||||
It found that titles with emotionally neutral wording have decreased in the share of all articles over time.
|
||||
A model was built to classify titles using BERT-style word embeddings and a simple classifier.
|
||||
\end{abstract}
|
||||
|
||||
\keywords{data mining, datasets, classification, clustering, neural networks}
|
||||
|
@ -39,162 +38,267 @@ A model was built to classify titles using BERT-style word embeddings and a simp
|
|||
|
||||
\section{Background}
|
||||
|
||||
Media and new publishers have been accused of polarizing discussion to drive up revenue and engagement.
|
||||
This paper seeks to quantify those claims by classifying the degree to which news headlines have become more emotionally charged over time.
|
||||
A secondary goal is to investigate whether news organizations have been uniformly polarized, or if one pole has been 'moving' more rapidly away from the 'middle'.
|
||||
This analysis will probe to what degree has the \href{https://en.wikipedia.org/wiki/Overton_window}{Overton Window} has shifted in the media.
|
||||
Noam Chomsky had a hypothesis about manufactured consent that is beyond the scope of this paper, so we will restrict our analysis to the presence of agenda instead of the cause of it.
|
||||
There is evidence of increased political polarization in the United States over the past 16 years.
|
||||
Through voting patterns and self-reported political viewpoints and party affiliation, the political landscape has seen a 'hollowing out' of the middle in recent years.
|
||||
A common categorization of political leanings can be summarized as a spectrum from left to right.
|
||||
In simple terms, political beliefs in the United States can be categorized as left, center or right.
|
||||
This political spectrum largely reflects affiliation with one of the two dominant political parties on the federal level, left associated with Democrats and right associated with Republicans.
|
||||
To appeal to different segments of this political affiliation, publishers of news might want to target content and coverage to issues relevant to only one side of the spectrum or the other.
|
||||
This phenomenon has led to concern over the creation of echo chambers where each side only consumes content made specifically to confirm their own beliefs.
|
||||
Driven by the market demand for confirmation bias, media and new publishers have been accused of polarizing discussion to drive up revenue and engagement.
|
||||
This paper seeks to quantify those claims by classifying the degree to which news headlines have become more emotionally charged over time.
|
||||
A secondary goal is to investigate whether news organizations have been uniformly polarized, or if one side of the spectrum has been 'moving' more rapidly away from the 'middle'.
|
||||
|
||||
There is evidence supporting an increase in political polarization in the United States over the past 16 years.
|
||||
There have been a number of studies conducted in an attempt to measure and explain this phenomenon. \cite{flaxman_filter_2016}
|
||||
|
||||
These studies attempt to link increased media options and a decrease in the proportion of less engaged and less partisan voters.
|
||||
This drop in less engaged voters might explain the increased partisanship in elections.
|
||||
However, the evidence regarding a direct causal relationship between partisan media messages and changes in attitudes or behaviors is inconclusive.
|
||||
Directly measuring the causal relationship between media messages and behavior is difficult.
|
||||
There is currently no solid evidence to support the claim that partisan media outlets are causing average Americans to become more partisan.
|
||||
|
||||
The number of media publishers has increased and in this particular data set:
|
||||
|
||||
These studies rest on the assumption that media outlets are becoming more partisan.
|
||||
We study this assumption in detail.
|
||||
|
||||
Party Sorting: Over the past few decades, there has been a significant increase in party sorting, where Democrats have become more ideologically liberal, and Republicans have become more ideologically conservative.
|
||||
This trend indicates a growing gap between the two major political parties.
|
||||
A study published in the journal American Political Science Review in 2018 found that party sorting increased significantly between 2004 and 2016.
|
||||
|
||||
Congressional Polarization: There has been a substantial increase in polarization among members of the U.S. Congress. Studies analyzing voting patterns and ideological positions of legislators have consistently shown a widening gap between Democrats and Republicans.
|
||||
The Pew Research Center reported that the median Democrat and the median Republican in Congress have become further apart ideologically between 2004 and 2017.
|
||||
|
||||
Public Opinion: Surveys and polls also provide evidence of increasing political polarization among the American public.
|
||||
According to a study conducted by Pew Research Center in 2017, the gap between Republicans and Democrats on key policy issues, such as immigration, the environment, and social issues, has widened significantly since 1994.
|
||||
|
||||
Media Fragmentation: The rise of social media and digital media platforms has contributed to the fragmentation of media consumption, leading to the creation of ideological echo chambers.
|
||||
Individuals are more likely to consume news and information that aligns with their pre-existing beliefs, reinforcing and intensifying polarization.
|
||||
|
||||
Increased Negative Attitudes: Studies have shown that Americans' attitudes towards members of the opposing political party have become increasingly negative. The Pew Research Center reported in 2016 that negative feelings towards the opposing party have doubled since the late 1990s, indicating a deepening divide.
|
||||
|
||||
- Memeorandum: **stories**
|
||||
- AllSides: **bias**
|
||||
- HuggingFace: **sentiment**
|
||||
- ChatGPT: **election dates**
|
||||
\section{Data Sources}
|
||||
All data was collected over the course of 2023 using python scripts, the source code for which is available on GitHub \cite{jensenDataMining5772023}.
|
||||
|
||||
All data was collected over the course of 2023.
|
||||
|
||||
\begin{table}
|
||||
\label{tab:freq}
|
||||
\caption{News Dataset Sources}
|
||||
\label{tab:1}
|
||||
\begin{tabular}{ll}
|
||||
\toprule
|
||||
Source & Description \\
|
||||
\midrule
|
||||
Memeorandum & News aggregation service. \\
|
||||
AllSides & Bias evaluator. \\
|
||||
MediaBiasFactCheck & Bias evaluator. \\
|
||||
HuggingFace & Classification model repository. \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
|
||||
\section{Data Preparation}
|
||||
\begin{table}
|
||||
\label{tab:sources}
|
||||
\caption{News Dataset Sources}
|
||||
\begin{tabular}{ll}
|
||||
\toprule
|
||||
Source & Description \\
|
||||
\midrule
|
||||
Memeorandum & News aggregation service. \\
|
||||
AllSides & Bias evaluator. \\
|
||||
MediaBiasFactCheck & Bias evaluator. \\
|
||||
HuggingFace & Classification model repository. \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
|
||||
\subsection{Memeorandum}
|
||||
The subject of analysis is a set of news article headlines scraped from the news aggregation site \href{https://memeorandum.com}{Memeorandum} for news stories from 2006 to 2022.
|
||||
Each news article has a title, author, description, publisher, publish date and url.
|
||||
All of these are non-numeric, except for the publication date which is ordinal.
|
||||
The site also has a concept of references, where a main, popular story may be covered by other sources.
|
||||
Using an archive of the website, each day's headlines were downloaded and parsed using python, then normalized and stored in sqlite database tables \cite{jensen_data_2023-1}.
|
||||
The main subject of analysis is a set of news article headlines downloaded from the news aggregation site \href{https://memeorandum.com}{Memeorandum}.
|
||||
The archive spans the years 2005 to 2023 and contains headlines from over 1,700 unique publishers \ref{tab:base-stats}.
|
||||
Each news article has a title, author, description, publisher, publish date and url.
|
||||
All of these are non-numeric, except for the publication date which is ordinal.
|
||||
The site also has a concept of references, where a main, popular story may be covered by other sources.
|
||||
Using an archive of the website, each day's headlines were downloaded and parsed using python, then normalized and stored in sqlite database tables \cite{jensenDataMining5772023a}.
|
||||
|
||||
\subsection{AllSides\\MediaBiasFactCheck}
|
||||
\begin{table}
|
||||
\caption{News Dataset Statistics After Cleaning}
|
||||
\label{tab:base-stats}
|
||||
\begin{tabular}{ll}
|
||||
\toprule
|
||||
stat & value \\
|
||||
\midrule
|
||||
publishers & 1,735 \\
|
||||
stories & 242,343 \\
|
||||
authors & 34,346 \\
|
||||
children & 808,628 \\
|
||||
date range & 2006-2022 \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
|
||||
\subsection{AllSides \& MediaBiasFactCheck}
|
||||
The media bias ratings are sourced with permission from two media watchdog groups, \href{https://www.allsides.com}{AllSides} and \href{https://mediabiasfactcheck.com/}{MediaBiasFactCheck}.
|
||||
These sources aggregate expert opinion and crowdsourced data.
|
||||
Each source's objective is to assess the bias and factual reporting of media and information sources.
|
||||
They claim to achieve this using methodologies that combine objective measures like use of primary sources, consistency of reporting accuracy and expert and crowdsourced opinion.
|
||||
Neither claims it is realistic to maintain perfect objectivity, and both openly admit that determining bias is challenging.
|
||||
Both sources provide a categorical value of bias from left to right.
|
||||
It is important to note the bias scale is based on the political landscape of the United States, which differs from that of other countries.
|
||||
For instance, while the Democrats are regarded as centrist or right-center in many nations, they are considered left-center within the US.
|
||||
This bias rating can be converted to a zero-centered quantitative measure, with center bias representing zero and left and right representing -2 and 2 respectively.
|
||||
In addition to bias, MediaBiasFactCheck provides a measure of trustworthiness of the publishers' source material.
|
||||
That measure is not used in this analysis, but could be an interesting feature to include in classification analysis in the future.
|
||||
|
||||
\subsection{Word Embeddings \& Sentiment}
|
||||
|
||||
What remains after cleaning is approximately 240,000 headlines from 1,700 publishers, 34,000 authors over about 64,000 days \ref{tab:1}.
|
||||
|
||||
\begin{table}
|
||||
\label{tab:freq}
|
||||
\caption{News Dataset Statistics After Cleaning}
|
||||
\label{tab:1}
|
||||
\begin{tabular}{ll}
|
||||
\toprule
|
||||
stat & value \\
|
||||
\midrule
|
||||
publishers & 1,735 \\
|
||||
stories & 242,343 \\
|
||||
authors & 34,346 \\
|
||||
children & 808,628 \\
|
||||
date range & 2006-2022 \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
A common operation in natural language processing is to take text, which is not quantitative, and transform it into a vector representation.
|
||||
There are a couple of common ways to do this.
|
||||
Historically, an algorithm like term frequency–inverse document frequency (TFIDF) was used.
|
||||
Term frequency counts the number of times a word appears within a document relative to the size of the document.
|
||||
Inverse document frequency measures how common a word appears in the corpus.
|
||||
The product of these two values combines the local importance of a term with its global importance across the corpus.
|
||||
In contrast, bidirectional encoder representations from transformers (BERT) use deep learning to capture contextual meaning of words.
|
||||
It learns an embedding by training on a large amount of text data.
|
||||
For the task of sentiment, the text data is labeled by humans with a value from -1, meaning negative, to 1, meaning positive.
|
||||
In this way, the BERT model can be a sophisticated classifier of tokenized text.
|
||||
The labeling of news titles with emotional and sentiment categories was accomplished by using a pre-trained large language model from \href{https://huggingface.co/arpanghoshal/EmoRoBERTa}{HuggingFace}.
|
||||
The emotional component of this model was trained on a dataset curated and published by Google\cite{demszkyGoEmotionsDatasetFineGrained2020} which manually classified a collection of 58,000 comments into 28 emotions.
|
||||
The classes for each article will be derived by tokenizing the title and running the model over the tokens, then grabbing the largest probability class from the output.
|
||||
|
||||
\subsection{Missing Data Policy}
|
||||
|
||||
The only news headlines used in this study were those with an associated bias rating from either AllSides or MediaBiasFactCheck.
|
||||
This eliminated about 5,300 publishers and 50,000 headlines, most of them outlets publishing less than one story per year.
|
||||
Another consideration was the relationship between the opinion and news sections of organizations.
|
||||
MediaBiasFactCheck makes a distinction between things like the Wall Street Journal's news organization, which it rates as 'Least Biased', and the Wall Street Journal's opinion organization, which it rates as 'Right'.
|
||||
Due to the nature of the Memeorandum dataset, and the way that organizations design their url structure, this study was not able to parse the headlines into news, opinion, blogs or other sub-categories recognized by the bias datasets.
|
||||
As such, news and opinion was combined under the same bias rating, and the rating with the most articles published was taken as the default value.
|
||||
This might lead to organizations with large newsrooms to bias toward the center in the dataset.
|
||||
The only news headlines used in this study were those with an associated bias rating from either AllSides or MediaBiasFactCheck.
|
||||
This eliminated about 5,300 publishers and 50,000 headlines, most of them outlets publishing less than one story per year.
|
||||
Another consideration was the relationship between the opinion and news sections of organizations.
|
||||
MediaBiasFactCheck makes a distinction between things like the Wall Street Journal's news organization, which it rates as 'Least Biased', and the Wall Street Journal's opinion organization, which it rates as 'Right'.
|
||||
Due to the nature of the Memeorandum dataset, and the way that organizations design their url structure, this study was not able to parse the headlines into news, opinion, blogs or other sub-categories recognized by the bias datasets.
|
||||
As such, news and opinion was combined under the same bias rating, and the rating with the most articles published was taken as the default value.
|
||||
This might lead to organizations with large newsrooms to bias toward the center in the dataset.
|
||||
What remains after cleaning is approximately 240,000 headlines from 1,700 publishers, 34,000 authors over about 64,000 days \ref{tab:base-stats}.
|
||||
|
||||
\begin{figure}[h]
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{figures/articles_per_bias_per_year.png}
|
||||
\caption{Articles per bias over time.}
|
||||
\Description{descriptive statistics on the news data source}
|
||||
\end{figure}
|
||||
|
||||
\section{Experiments}
|
||||
|
||||
\subsection{Link Similarity Clustering and Classification}
|
||||
The links between breaking news and coverage of that news by other sources can be thought of as a short and wide tree data structure.
|
||||
Or more generally, each headline and linking article are nodes in a directed graph with a single edge pointing from the coverage to the headline.
|
||||
The stories were already in an adjacency list representation, so it was easy to convert it to an adjacency matrix.
|
||||
This matrix then had a row for every publisher with at least one breaking news item, and a column for all publishers in the dataset.
|
||||
The value of the edge from child to parent took on the form of one of three options: onehot encoding, total references and normalized references.
|
||||
The one hot encoding scheme was the simplest: if a link exists between child and parent, put a one, otherwise, put a zero.
|
||||
The total references scheme was similar, but the references were summed, so each cell contained the sum of all links between parent and child.
|
||||
The normalized scheme extended the total references scheme by dividing each cell by the sum of references across each row, so that each row would sum to one.
|
||||
The result was three matrices, one per encoding scheme.
|
||||
The creation and reduction of the link graph with principal component analysis will need to be done to visualize the relationship between related publishers.
|
||||
|
||||
\subsection{Title Sentiment Classification}
|
||||
|
||||
|
||||
Of the features used in the analysis, there are enough data points that null or missing values can safely be excluded.
|
||||
The bias ratings do not cover all publisher in the dataset, but there are enough remaining labels to make classification an interesting task.
|
||||
|
||||
for every title, tokenize, classify.
|
||||
|
||||
The classification of news titles into emotional categories was accomplished by using a pre-trained large language model from \href{https://huggingface.co/arpanghoshal/EmoRoBERTa}{HuggingFace}.
|
||||
This model was trained on \href{https://ai.googleblog.com/2021/10/goemotions-dataset-for-fine-grained.html}{a dataset curated and published by Google} which manually classified a collection of 58,000 comments into 28 emotions.
|
||||
The classes for each article will be derived by tokenizing the title and running the model over the tokens, then grabbing the largest probability class from the output.
|
||||
|
||||
The data has been discretized into years.
|
||||
The data has been discretized into years.
|
||||
Additionally, the publishers will have been discretized based on either principal component analysis on link similarity or on the bias ratings of \href{https://www.allsides.com/media-bias/ratings}{All Sides}.
|
||||
Given that the features of the dataset are sparse, it is not expected to have any useless attributes, unless the original hypothesis of a temporal trend proving to be false.
|
||||
Of the features used in the analysis, there are enough data points that null or missing values can safely be excluded.
|
||||
Given that the features of the dataset are sparse, it is not expected to have any useless attributes, unless the original hypothesis of a temporal trend proving to be false.
|
||||
|
||||
No computational experiment have been done yet.
|
||||
Generating the tokenized text, the word embedding and the emotional sentiment analysis have made up the bulk of the work thus far.
|
||||
The bias ratings do not cover all publisher in the dataset, so the number of articles without a bias rating from their publisher will have to be calculated.
|
||||
If it is less than 30\% of the articles, it might not make sense to use the bias ratings.
|
||||
The creation and reduction of the link graph with principal component analysis will need to be done to visualize the relationship between related publishers.
|
||||
|
||||
|
||||
\section{Results}
|
||||
|
||||
\subsection{Link Similarity Clustering and Classification}
|
||||
|
||||
\paragraph{Elbow Method}
|
||||
To determine the optimal number of clusters to use for analysis of the link similarity experiments, a plot of squared distances between centroids vs. bin size $k$ was used.
|
||||
Commonly called an 'elbow plot', it helps find a point of diminishing returns, where adding more clusters does not significantly improve the quality of clustering.
|
||||
This analysis heuristically reveals the optimal number of clusters to be in the range of 4 to 7 \ref{fig:elbow}.
|
||||
|
||||
\begin{figure}[h]
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{figures/articles_per_year.png}
|
||||
\caption{Articles per year.}
|
||||
\Description{descriptive statistics on the news data source}
|
||||
\includegraphics[width=\linewidth]{figures/link_cluster_elbow.png}
|
||||
\caption{Elbow Criterion for KMeans}
|
||||
\label{fig:elbow}
|
||||
\end{figure}
|
||||
|
||||
The idea behind the elbow plot is to find a balance between having too few or too many clusters.
|
||||
Too few results in under-segmentation, while too many clusters may lead to over-segmentation and less interpretable results.
|
||||
|
||||
\paragraph{Clustering}
|
||||
|
||||
All three encoding schemes (onehot, links quantity, normalized) were ran through a KMeans clustering with $k=5$.
|
||||
To visualize the results, a principal component analysis (PCA) was performed on the input.
|
||||
PCA is a technique for dimensionality reduction and data visualization.
|
||||
It transforms correlated variables into a smaller set of uncorrelated variables, retaining as much information as possible from the original data.
|
||||
The first two principal components represent the most important patterns or structures in the data.
|
||||
Plotting these two components against the predicted labels of the KMeans clustering algorithm
|
||||
gives us a good gauge on the accuracy of the clustering \ref{fig:cluster-onehot} \ref{fig:cluster-normalized} \ref{fig:cluster-links}.
|
||||
|
||||
Clustering on link quantity leads to most publishers being labeled with a single class \ref{fig:cluster-links}.
|
||||
|
||||
\begin{figure}
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{figures/link_pca_clusters_links.png}
|
||||
\caption{PCA components vs. KMeans Clusters (Links)}
|
||||
\label{fig:cluster-links}
|
||||
\end{figure}
|
||||
|
||||
The problem of all publishers being assigned a single label is lessened when the link quantity is normalized \ref{fig:cluster-normalized}.
|
||||
It was initially thought that the quantity of the links, or the frequency of stories published, would dominate the clustering and the PCA analysis, but less prolific publishers are present in the smaller classes instead.
|
||||
|
||||
\begin{figure}[h]
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{figures/link_pca_clusters_normalized.png}
|
||||
\caption{PCA components vs. KMeans Clusters (Normalized)}
|
||||
\label{fig:cluster-normalized}
|
||||
\end{figure}
|
||||
|
||||
The classes are more evenly distributed when a one hot encoding is used to generate the KMeans clustering \ref{fig:cluster-onehot}.
|
||||
|
||||
\begin{figure}[h]
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{figures/link_pca_clusters_onehot.png}
|
||||
\caption{PCA components vs. KMeans Clusters (One Hot)}
|
||||
\label{fig:cluster-onehot}
|
||||
\end{figure}
|
||||
|
||||
|
||||
\paragraph{Classification}
|
||||
A k-nearest neighbors (kNN) classification algorithm was trained on the adjacency matrix; the one hot encoding was used for all of the following experiments.
|
||||
To test the classification accuracy, the link dataset was separated into a training and a test set with a ratio of $80:20$ respectively.
|
||||
|
||||
To visualize the results, a confusion matrix was generated, plotting the true values of the hold out data against what the kNN model predicts \ref{fig:link-confusion}.
|
||||
|
||||
\begin{figure}[h]
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{figures/link_confusion.png}
|
||||
\caption{kNN confusion matrix of related links adjacency matrix}
|
||||
\label{fig:link-confusion}
|
||||
\Description{}
|
||||
\end{figure}
|
||||
|
||||
Generally, higher numbers along the major diagonal, where the correct label is the same as the predicted label, indicate a more accurate model.
|
||||
Additionally, the distribution of values is relatively balanced, indicating the model is performing consistently across different classes without much skew toward one particular classification.
|
||||
|
||||
Overall, the linking between publishers around the same headlines serves as a good feature set for predicting the bias class of a particular publisher.
|
||||
A model such as the kNN, trained on this data, could be used to classify unseen publishers based on where they get their sources from.
|
||||
|
||||
\subsection{Title Sentiment Classification}
|
||||
|
||||
Tracking sentiment over time was very straight forward once classes were extracted from the pre-trained model \ref{fig:sentiment-time}.
|
||||
In general, the sentiment of news headlines trended from positive toward negative over time.
|
||||
The only points over the extent of this dataset where the sentiment was positive ($> 0.5$), for any publication on the political spectrum, were short periods around 2008 and 2011.
|
||||
|
||||
\begin{figure}[h]
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{figures/bias_vs_sentiment_over_time.png}
|
||||
\caption{Sentiment vs. bias over time}
|
||||
\Description{Timeseries classifcation of news titles sentiment and bias}
|
||||
\label{fig:sentiment-time}
|
||||
\end{figure}
|
||||
|
||||
\paragraph{Emotion}
|
||||
In addition to sentiment, ranging from 0 to 1, another BERT based classifier was used to extract the emotional tone of the headlines.
|
||||
The dominant emotion by far was a neutral one \ref{fig:emotion-time}.
|
||||
|
||||
\begin{figure}[h]
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{figures/link_pca_clusters_onehot.png}
|
||||
\caption{kNN confusion matrix of related links adjacency matrix}
|
||||
\Description{}
|
||||
\includegraphics[width=\linewidth]{figures/emotion_over_time.png}
|
||||
\caption{Emotion Tone Frequency Over Time}
|
||||
\label{fig:emotion-time}
|
||||
\end{figure}
|
||||
|
||||
In general, the emotional tone of headlines have not seen a significant amount of change over time.
|
||||
A regression model was fit to the average emotional content per week.
|
||||
The extreme ends of the political spectrum actually saw the highest increase in neutral titles overtime according to the slope of the regression model \ref{fig:emotion-reg}.
|
||||
The publishers labeled as the least biased saw the highest decrease in neutral emotion expressed.
|
||||
|
||||
|
||||
\begin{figure}[h]
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{figures/emotion_regression.png}
|
||||
\caption{Slope of Regression (stories/year) by Bias and Emotion}
|
||||
\label{fig:emotion-reg}
|
||||
\end{figure}
|
||||
|
||||
|
||||
% \section{Math Equations}
|
||||
\section{Discussion}
|
||||
We performed link similarity clustering using three encoding schemes: one-hot encoding, total references, and normalized references.
|
||||
We applied KMeans clustering and visualized the results using PCA.
|
||||
We found one-hot encoding scheme led to more evenly distributed clusters, while the link quantity scheme resulted in most publishers being labeled with a single class.
|
||||
The normalized scheme reduced the issue of all publishers being labeled with a single class but not enough to use in subsequent studies.
|
||||
|
||||
% \begin{equation}
|
||||
% \sum_{i=0}^{\infty}x_i=\int_{0}^{\pi+2} f
|
||||
% \end{equation}
|
||||
For classification, a kNN model was trained on the link adjacency matrix using the one-hot encoding scheme.
|
||||
The accuracy of the classification was evaluated using a confusion matrix, which showed consistent performance across different classes.
|
||||
|
||||
We used a pre-trained BERT model to classify the sentiment and emotional content of news headlines.
|
||||
We observed a trend towards decreasing sentiment from positive to negative over time.
|
||||
We found that the dominant emotion was neutral throughout the dataset.
|
||||
|
||||
Overall, the results of the study indicate a polarization in news headlines over time, with a shift towards more negative sentiment.
|
||||
This study explores the impact of political polarization on the emotional characteristics of news headlines and the link similarity of publication with the same political bias.
|
||||
|
||||
\begin{acks}
|
||||
To Dr. Hearne, for the instruction on clustering and classification techniques, and to Pax Newman for the discussion on word embeddings.
|
||||
|
@ -205,9 +309,5 @@ To Dr. Hearne, for the instruction on clustering and classification techniques,
|
|||
|
||||
\appendix
|
||||
|
||||
\section{Online Resources}
|
||||
|
||||
The source code for the study is available on GitHub \cite{jensen_data_2023}.
|
||||
|
||||
\end{document}
|
||||
\endinput
|
||||
|
|
|
@ -0,0 +1,19 @@
|
|||
|
||||
<!-- .slide: class="center" -->
|
||||
|
||||
# Remaining Work
|
||||
|
||||
==
|
||||
|
||||
# Experiment 6 (**TODO**)
|
||||
|
||||
## Setup
|
||||
|
||||
- Have a lot of features now. <!-- .element: class="fragment" -->
|
||||
- Link PCA components.
|
||||
- Embedding PCA components.
|
||||
- Sentiment.
|
||||
- Emotion.
|
||||
- Can we predict with all of them: Bias. <!-- .element: class="fragment" -->
|
||||
|
||||
==
|
|
@ -0,0 +1,11 @@
|
|||
|
||||
|
||||
```
|
||||
Hi! I'm a Masters in CS at Western Washington University taking a Data Mining course (CS 577). We have to pick a dataset and do an analysis on it. I want to use your bias labels to augment a dataset I've built using story titles from memeorandum.com. I want to do a classification task and a cluster task based on the text embeddings of the titles. I'm curious to see if the Overton Window has shifted over time. I'm also curious to quantify along the billionaire/independent axis which titles are associated with which outlets.
|
||||
|
||||
It's academic use only, but the results (not the raw data) will probably make their way onto my website (publicmatt.com).
|
||||
|
||||
Let me know if you have any other questions (or good research questions???).
|
||||
|
||||
Thanks!
|
||||
```
|
|
@ -8,7 +8,7 @@ from pathlib import Path
|
|||
import os
|
||||
import sys
|
||||
import click
|
||||
from data.main import connect, map_tld, paths, reporting_label_to_int
|
||||
from data.main import connect, map_tld, paths, reporting_label_to_int, bias_label_to_int
|
||||
from random import randint
|
||||
from time import sleep
|
||||
from tqdm import tqdm
|
||||
|
@ -128,6 +128,8 @@ def create_tables():
|
|||
df['tld'] = df.tld.apply(map_tld)
|
||||
df['ordinal'] = df.bias.apply(bias_label_to_int)
|
||||
|
||||
df.tld
|
||||
|
||||
with connect() as db:
|
||||
db.sql("""
|
||||
CREATE OR REPLACE TABLE mbfc.publishers AS
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
import click
|
||||
from data.main import connect
|
||||
from data.main import connect, paths, ticklabels
|
||||
import pandas as pd
|
||||
from sklearn.decomposition import PCA
|
||||
from sklearn.cluster import KMeans
|
||||
|
||||
@click.command('links:create-table')
|
||||
def create_table():
|
||||
|
@ -53,7 +55,6 @@ def create_table():
|
|||
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
|
||||
def create_pca(source):
|
||||
"""create 2D pca labels"""
|
||||
from sklearn.decomposition import PCA
|
||||
|
||||
table_name = f"publisher_pca_{source}"
|
||||
|
||||
|
@ -62,8 +63,6 @@ def create_pca(source):
|
|||
SELECT
|
||||
p.*
|
||||
FROM mbfc.publishers p
|
||||
JOIN mbfc.publisher_stories ps
|
||||
ON p.id = ps.publisher_id
|
||||
""").df()
|
||||
df = db.query(f"""
|
||||
SELECT
|
||||
|
@ -98,9 +97,10 @@ def create_pca(source):
|
|||
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
|
||||
def create_clusters(source):
|
||||
"""create link adj. matrix clusters table"""
|
||||
from sklearn.cluster import KMeans
|
||||
|
||||
source = 'links'
|
||||
table_name = f"publisher_clusters_{source}"
|
||||
|
||||
with connect() as db:
|
||||
df = db.query(f"""
|
||||
SELECT
|
||||
|
@ -113,17 +113,18 @@ def create_clusters(source):
|
|||
SELECT
|
||||
p.*
|
||||
FROM mbfc.publishers p
|
||||
JOIN mbfc.publisher_stories ps
|
||||
ON ps.publisher_id = p.id
|
||||
""").df()
|
||||
|
||||
|
||||
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
|
||||
k = 8
|
||||
k = 5
|
||||
kmeans = KMeans(n_clusters=k, n_init="auto")
|
||||
pred = kmeans.fit_predict(pivot)
|
||||
out = pivot.reset_index()[['parent_id']]
|
||||
out['label'] = pred
|
||||
out = pd.merge(out, pub, left_on='parent_id', right_on='id')
|
||||
new_table = out[['id', 'label']]
|
||||
|
||||
with connect() as db:
|
||||
db.query(f"""
|
||||
CREATE OR REPLACE TABLE {table_name} AS
|
||||
|
@ -132,4 +133,5 @@ def create_clusters(source):
|
|||
,n.label as label
|
||||
FROM new_table n
|
||||
""")
|
||||
|
||||
print(f"created {table_name}")
|
||||
|
|
|
@ -11,7 +11,7 @@ def articles_per_year():
|
|||
save_to = paths('figures') / 'articles_per_year.png'
|
||||
|
||||
with connect() as db:
|
||||
data = DB.query("""
|
||||
data = db.query("""
|
||||
select
|
||||
year(published_at) as year
|
||||
,count(1) as stories
|
||||
|
@ -27,6 +27,40 @@ def articles_per_year():
|
|||
plt.savefig(save_to)
|
||||
print(f"saved: {save_to}")
|
||||
|
||||
@click.command('descriptive:articles-per-bias-per-year')
|
||||
def articles_per_bias_per_year():
|
||||
|
||||
save_to = paths('figures') / 'articles_per_bias_per_year.png'
|
||||
|
||||
with connect() as db:
|
||||
data = db.query("""
|
||||
select
|
||||
date_trunc('year', s.published_at) as year
|
||||
,p.bias
|
||||
,count(1) as stories
|
||||
from stories s
|
||||
join mbfc.publisher_stories ps
|
||||
on ps.story_id = s.id
|
||||
join mbfc.publishers p
|
||||
on p.id = ps.publisher_id
|
||||
where year(s.published_at) not in (2005, 2023)
|
||||
and p.bias ilike 'L%'
|
||||
or p.bias ilike 'R%'
|
||||
group by
|
||||
date_trunc('year', s.published_at)
|
||||
,p.bias
|
||||
order by mode(p.ordinal)
|
||||
""").df()
|
||||
|
||||
fig, ax = plt.subplots(figsize=(5, 5))
|
||||
sns.lineplot(x=data.year, y=data.stories, hue=data.bias, ax=ax, palette='rainbow')
|
||||
ax.tick_params(axis='x', rotation=90)
|
||||
ax.set(ylabel="count of stories (#)")
|
||||
plt.legend(loc='upper right')
|
||||
plt.tight_layout()
|
||||
plt.savefig(save_to)
|
||||
print(f"saved: {save_to}")
|
||||
|
||||
@click.command('descriptive:distinct-publishers')
|
||||
def distinct_publishers():
|
||||
save_to = paths('figures') / 'distinct_publishers.png'
|
||||
|
|
|
@ -3,13 +3,14 @@ from data.main import connect, ticklabels, paths
|
|||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
from sklearn.metrics import silhouette_score
|
||||
import pandas as pd
|
||||
|
||||
from sklearn.cluster import KMeans
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
from sklearn.metrics import ConfusionMatrixDisplay, silhouette_score
|
||||
|
||||
@click.command('links:elbow')
|
||||
def elbow():
|
||||
from sklearn.cluster import KMeans
|
||||
|
||||
save_to = paths('figures') / 'link_cluster_elbow.png'
|
||||
|
||||
|
@ -32,7 +33,7 @@ def elbow():
|
|||
to_plot = pd.DataFrame(to_plot)
|
||||
|
||||
ax = sns.lineplot(x=to_plot.k, y=to_plot.inertia)
|
||||
ax.set(title="elbow criterion plot of clusters", xlabel="bin size (k)", ylabel="sum of squared distances between centroids/points")
|
||||
ax.set(xlabel="bin size (k)", ylabel="sum of squared distances between centroids/points")
|
||||
plt.savefig(save_to)
|
||||
plt.close()
|
||||
print(f"saved plot: {save_to}")
|
||||
|
@ -43,40 +44,31 @@ def elbow():
|
|||
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
|
||||
def link_pca_clusters(source):
|
||||
|
||||
source = 'onehot'
|
||||
save_to = paths('figures') / f"link_pca_clusters_{source}.png"
|
||||
|
||||
with connect() as db:
|
||||
df = db.query(f"""
|
||||
SELECT
|
||||
c.label as cluster
|
||||
,p.tld
|
||||
--,b.label as bias
|
||||
,pca.first
|
||||
pca.first
|
||||
,pca.second
|
||||
,s.cnt as stories
|
||||
FROM publisher_clusters_{source} c
|
||||
JOIN mbfc.publisher_stories ps
|
||||
ON ps.publisher_id = c.publisher_id
|
||||
JOIN mbfc.publishers p
|
||||
ON ps.publisher_id = p.id
|
||||
JOIN
|
||||
(
|
||||
select
|
||||
p.id as publisher_id
|
||||
,count(1) as cnt
|
||||
FROM mbfc.publishers p
|
||||
GROUP BY
|
||||
p.id
|
||||
) s
|
||||
ON s.publisher_id = p.id
|
||||
,pca.publisher_id
|
||||
,p.ordinal as bias
|
||||
,p.name
|
||||
,clusters.label as cluster
|
||||
,count(1) over() as cnt
|
||||
FROM mbfc.publishers p
|
||||
JOIN publisher_pca_{source} pca
|
||||
ON pca.publisher_id = p.id
|
||||
ON p.id = pca.publisher_id
|
||||
JOIN publisher_clusters_{source} clusters
|
||||
ON p.id = clusters.publisher_id
|
||||
""").df()
|
||||
|
||||
ax = sns.scatterplot(x=df['first'], y=df['second'], hue=df['cluster'])
|
||||
ax = sns.scatterplot(df, x='first', y='second', hue='cluster')
|
||||
ax.set(title=f"pca components vs. clusters ({source})", xlabel="first pca component", ylabel="second pca component")
|
||||
plt.savefig(save_to)
|
||||
print(f"saved plot: {save_to}")
|
||||
plt.close()
|
||||
|
||||
|
||||
def test():
|
||||
|
@ -108,9 +100,6 @@ def test():
|
|||
|
||||
@click.command('links:confusion')
|
||||
def link_confusion():
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
from sklearn.metrics import ConfusionMatrixDisplay
|
||||
|
||||
filename = "link_confusion.png"
|
||||
save_to = paths('figures') / filename
|
||||
|
@ -119,28 +108,13 @@ def link_confusion():
|
|||
bias = db.query("""
|
||||
SELECT
|
||||
p.id as publisher_id
|
||||
,b.ordinal
|
||||
FROM top.publishers p
|
||||
JOIN top.publisher_bias pb
|
||||
ON pb.publisher_id = p.id
|
||||
JOIN bias_ratings b
|
||||
ON b.id = pb.bias_id
|
||||
,p.ordinal
|
||||
FROM mbfc.publishers p
|
||||
""").df()
|
||||
|
||||
df = db.query("""
|
||||
SELECT
|
||||
*
|
||||
FROM top.link_edges
|
||||
WHERE parent_id in (
|
||||
select
|
||||
publisher_id
|
||||
from bias
|
||||
)
|
||||
AND child_id in (
|
||||
select
|
||||
publisher_id
|
||||
from bias
|
||||
)
|
||||
FROM link_edges
|
||||
""").df()
|
||||
|
||||
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
|
||||
|
@ -167,9 +141,6 @@ def link_confusion():
|
|||
|
||||
@click.command('links:classifier')
|
||||
def link_confusion():
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
from sklearn.metrics import ConfusionMatrixDisplay
|
||||
|
||||
save_to = paths('figures') / "link_confusion.png"
|
||||
|
||||
|
@ -204,15 +175,20 @@ def link_confusion():
|
|||
x = publisher_matrix.loc[:, ~publisher_matrix.columns.isin(['publisher_id', 'ordinal'])].values
|
||||
y = publisher_matrix['ordinal']
|
||||
|
||||
x_train, x_test = train_test_split(x)
|
||||
y_train, y_test = train_test_split(y)
|
||||
|
||||
model = KNeighborsClassifier(n_neighbors=5)
|
||||
model.fit(x, y)
|
||||
y_pred = model.predict(x)
|
||||
model.fit(x_train, y_train)
|
||||
y_pred = model.predict(x_test)
|
||||
|
||||
publisher_matrix['pred'] = y_pred
|
||||
publisher_matrix
|
||||
|
||||
|
||||
fig, ax = plt.subplots(figsize=(5, 5))
|
||||
ConfusionMatrixDisplay.from_predictions(publisher_matrix['ordinal'], publisher_matrix['pred'], ax=ax)
|
||||
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax)
|
||||
ax.legend().remove()
|
||||
ax.set(xticklabels=ticklabels(), yticklabels=ticklabels())
|
||||
plt.xticks(rotation=45)
|
||||
plt.tight_layout()
|
||||
|
|