diff --git a/.gitignore b/.gitignore
index d7d7bff..0c518ae 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,8 @@
*.swp
__pycache__
tmp.py
+.env
+*.aux
+*.log
+*.out
+tmp.*
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..58f2d1f
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,11 @@
+.PHONY: to_wwu
+
+all: to_wwu
+
+to_wwu:
+ rsync -avz ~/577/repo/docs/figures/ linux-04:/home/jensen33/Dev/studentweb/assets/static/577/
+ scp ~/577/repo/docs/presentation.md linux-04:/home/jensen33/Dev/studentweb/content/577/contents.lr
+ scp ~/Dev/www.publicmatt.com/models/slides.ini linux-04:/home/jensen33/Dev/studentweb/models/
+ scp ~/Dev/www.publicmatt.com/templates/slides.html linux-04:/home/jensen33/Dev/studentweb/templates/
+ rsync -avz ~/Dev/www.publicmatt.com/assets/static/revealjs linux-04:/home/jensen33/Dev/studentweb/assets/static/
+ ssh linux-04 cd /home/jensen33/Dev/studentweb \; make
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..a63e81e
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,3 @@
+paper.pdf: paper.tex
+	pdflatex $<
+ evince $@
diff --git a/docs/figures/allsides_request.png b/docs/figures/allsides_request.png
new file mode 100644
index 0000000..920b104
Binary files /dev/null and b/docs/figures/allsides_request.png differ
diff --git a/docs/figures/articles_per_year.png b/docs/figures/articles_per_year.png
new file mode 100644
index 0000000..7a41821
Binary files /dev/null and b/docs/figures/articles_per_year.png differ
diff --git a/docs/figures/common_tld.png b/docs/figures/common_tld.png
new file mode 100644
index 0000000..e98b03f
Binary files /dev/null and b/docs/figures/common_tld.png differ
diff --git a/docs/figures/distinct_publishers.png b/docs/figures/distinct_publishers.png
new file mode 100644
index 0000000..e7113e2
Binary files /dev/null and b/docs/figures/distinct_publishers.png differ
diff --git a/docs/figures/link_cluster_elbow.png b/docs/figures/link_cluster_elbow.png
new file mode 100644
index 0000000..2764351
Binary files /dev/null and b/docs/figures/link_cluster_elbow.png differ
diff --git a/docs/figures/link_pca_clusters_links.png b/docs/figures/link_pca_clusters_links.png
new file mode 100644
index 0000000..edd7229
Binary files /dev/null and b/docs/figures/link_pca_clusters_links.png differ
diff --git a/docs/figures/link_pca_clusters_normalized.png b/docs/figures/link_pca_clusters_normalized.png
new file mode 100644
index 0000000..da0cc82
Binary files /dev/null and b/docs/figures/link_pca_clusters_normalized.png differ
diff --git a/docs/figures/link_pca_clusters_onehot.png b/docs/figures/link_pca_clusters_onehot.png
new file mode 100644
index 0000000..94941c0
Binary files /dev/null and b/docs/figures/link_pca_clusters_onehot.png differ
diff --git a/docs/figures/pca_with_classes.png b/docs/figures/pca_with_classes.png
new file mode 100644
index 0000000..a0362ef
Binary files /dev/null and b/docs/figures/pca_with_classes.png differ
diff --git a/docs/figures/stories_per_publisher.png b/docs/figures/stories_per_publisher.png
new file mode 100644
index 0000000..f63d983
Binary files /dev/null and b/docs/figures/stories_per_publisher.png differ
diff --git a/docs/figures/top_publishers.png b/docs/figures/top_publishers.png
new file mode 100644
index 0000000..8961cb7
Binary files /dev/null and b/docs/figures/top_publishers.png differ
diff --git a/docs/paper.pdf b/docs/paper.pdf
new file mode 100644
index 0000000..d374c95
Binary files /dev/null and b/docs/paper.pdf differ
diff --git a/docs/paper.tex b/docs/paper.tex
new file mode 100644
index 0000000..9e5af5d
--- /dev/null
+++ b/docs/paper.tex
@@ -0,0 +1,61 @@
+\documentclass{article}
+\usepackage{multicol}
+\usepackage{hyperref}
+\title{Data Mining CS 577}
+\author{Matt Jensen}
+\date{2023-04-25}
+
+\begin{document}
+\maketitle
+
+\section*{Abstract}
+
+News organizations have been repeatedly accused of being partisan.
+Additionally, they have been accused of polarizing discussion to drive up revenue and engagement.
+This paper seeks to quantify those claims by classifying the degree to which news headlines have become more emotionally charged over time.
+A secondary goal is to investigate whether news organizations have been uniformly polarized, or if one pole has been `moving' more rapidly away from the `middle'.
+This analysis will probe to what degree the \href{https://en.wikipedia.org/wiki/Overton_window}{Overton Window} has shifted in the media.
+Noam Chomsky's manufacturing consent hypothesis is beyond the scope of this paper, so we will restrict our analysis to the presence of an agenda rather than its cause.
+
+\begin{multicols}{2}
+
+\section{Data Preparation}
+The subject of analysis is a set of news article headlines scraped from the news aggregation site \href{https://www.memeorandum.com}{Memeorandum}, covering news stories from 2006 to 2022.
+Each news article has a title, author, description, publisher, publish date, url and related discussions.
+The site also has a concept of references, where a main, popular story may be covered by other sources.
+This link association might be used to support one or more of the hypotheses of the main analysis.
+After scraping the site, the data will need to be deduplicated and normalized to minimize storage costs and processing errors.
+What remains after these cleaning steps is approximately 6,400 days of material: 300,000 distinct headlines from 21,000 publishers and 34,000 authors.
+
+\section{Missing Data Policy}
+
+The largest missing-data issue to be dealt with is news organizations that share the same parent company but appear under slightly different names.
+The Wall Street Journal's news section, for example, is drastically different from its opinion section.
+Other organizations have slightly different names for the same outlet, a product of the aggregation service rather than any real difference.
+Luckily, most of the analysis operates on the content of the news headlines, which do not suffer from this data impurity.
+
+\section{Classification Task}
+
+The classification of news titles into emotional categories was accomplished by using a pretrained large language model from \href{https://huggingface.co/arpanghoshal/EmoRoBERTa}{HuggingFace}.
+This model was trained on \href{https://ai.googleblog.com/2021/10/goemotions-dataset-for-fine-grained.html}{a dataset curated and published by Google} which manually classified a collection of 58,000 comments into 28 emotions.
+The class for each article will be derived by tokenizing the title, running the model over the tokens, and taking the largest-probability class from the output.
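+In code, this amounts to only a few lines; the following is a hedged sketch using the HuggingFace pipeline API, not the exact production pipeline:
+\begin{verbatim}
+from transformers import pipeline
+
+# hypothetical usage sketch
+classify = pipeline("text-classification",
+                    model="arpanghoshal/EmoRoBERTa")
+label = classify("Some news headline")[0]["label"]
+\end{verbatim}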
+
+The data has been discretized into years.
+Additionally, the publishers will be discretized based on either principal component analysis of link similarity or the bias ratings of \href{https://www.allsides.com/media-bias/ratings}{AllSides}.
+Given that the features of the dataset are sparse, no useless attributes are expected, unless the original hypothesis of a temporal trend proves to be false.
+Of the features used in the analysis, there are enough data points that null or missing values can safely be excluded.
+
+\section{Experiments}
+
+No computational experiments have been done yet.
+Generating the tokenized text, the word embeddings and the emotional sentiment analysis has made up the bulk of the work thus far.
+The bias ratings do not cover all publishers in the dataset, so the number of articles whose publisher lacks a bias rating will have to be calculated.
+If more than 30\% of the articles lack a rating, it might not make sense to use the bias ratings.
+The link graph will need to be created and reduced with principal component analysis to visualize the relationships between related publishers.
+
+\section{Results}
+\textbf{TODO.}
+
+\end{multicols}
+
+\end{document}
diff --git a/docs/presentation.md b/docs/presentation.md
new file mode 100644
index 0000000..c6ad8f1
--- /dev/null
+++ b/docs/presentation.md
@@ -0,0 +1,552 @@
+_model: slides
+---
+
+title: CSCI 577 - Data Mining
+
+---
+body:
+
+# Political Polarization
+
+Matt Jensen
+
+===
+
+# Hypothesis
+
+Political polarization is rising, and news articles are a proxy measure.
+
+==
+
+# Is this reasonable?
+
+
+==
+
+# Why is polarization rising?
+
+Not my job, but there's [research](#references) to support it.
+
+
+==
+
+# Sub-hypothesis
+
+- The polarization increases near elections.
+- The polarization is not evenly distributed across publishers.
+- The polarization is not evenly distributed across the political spectrum.
+
+==
+
+# Sub-sub-hypothesis
+
+- Similarly polarized publishers link to each other.
+- 'Mainstream' media uses more neutral titles.
+- Highly polarized publications don't last as long.
+
+===
+
+# Data Source(s)
+
+memeorandum.com
+
+allsides.com
+
+huggingface.co
+
+===
+
+
+
+===
+
+# memeorandum.com
+
+- News aggregation site.
+- Was really famous before Google News.
+- Still aggregates sites today.
+
+==
+
+# Why Memeorandum?
+
+- Behavioral: I sometimes only read titles (doom scrolling).
+- Behavioral: It's my source of news (with sister site TechMeme.com).
+- Convenient: most publishers block bots.
+- Convenient: dead simple html to parse.
+- Archival: all headlines from 2006 forward.
+- Archival: automated, not editorialized.
+
+===
+
+
+
+===
+
+# AllSides.com
+
+- Rates news publications as left, center or right.
+- Ratings combine:
+ - blind bias surveys.
+ - editorial reviews.
+ - third party research.
+ - community voting.
+- Originally scraped the website, but eventually got direct access.
+
+
+==
+
+# Why AllSides?
+
+- Behavioral: One of the first google results on bias apis.
+- Convenient: Ordinal ratings [-2: very left, 2: very right].
+- Convenient: Easy format.
+- Archival: Covers 1400 publishers.
+
+===
+
+
+
+===
+
+# HuggingFace.co
+
+- Deep Learning library.
+- Lots of pretrained models.
+- Easy, off the shelf word/sentence embeddings and text classification models.
+
+==
+
+# Why HuggingFace?
+
+- Behavioral: Language Models are HOT right now.
+- Behavioral: The dataset needed more features.
+- Convenient: Literally 5 lines of python.
+- Convenient: Testing different model performance was easy.
+- Archival: Lots of pretrained classification tasks.
+
+===
+
+# Data Structures
+Stories
+
+- Top level stories.
+ - title.
+ - publisher.
+ - author.
+- Related discussion.
+ - publisher.
+ - uses 'parent' story as a source.
+- Stream of stories (changes constantly).
+
+==
+
+# Data Structures
+Bias
+
+- Per publisher.
+ - name.
+ - label.
+ - agree/disagree vote by community.
+- Name could be semi-automatically joined to stories.
+
+==
+
+# Data Structures
+Embeddings
+
+- Per story title.
+ - sentence embedding (n, 384).
+ - sentiment classification (n, 1).
+ - emotional classification (n, 1).
+- ~ 1 hour of inference time to map story titles and descriptions.
+
+===
+
+# Data Collection
+
+==
+
+# Data Collection
+
+Story Scraper (simplified)
+
+```python
+from datetime import date, timedelta
+from pathlib import Path
+import requests
+
+output_dir = Path('data/memeorandum')  # assumption: local cache dir
+day = timedelta(days=1)
+cur = date(2005, 10, 1)
+end = date.today()
+while cur <= end:
+ cur = cur + day
+ save_as = output_dir / f"{cur.strftime('%y-%m-%d')}.html"
+ url = f"https://www.memeorandum.com/{cur.strftime('%y%m%d')}/h2000"
+ r = requests.get(url)
+ with open(save_as, 'w') as f:
+ f.write(r.text)
+```
+
+==
+
+# Data Collection
+Bias Scraper (hard)
+
+```python
+...
+bias_html = DATA_DIR / 'allsides.html'
+parser = etree.HTMLParser()
+tree = etree.parse(str(bias_html), parser)
+root = tree.getroot()
+rows = root.xpath('//table[contains(@class,"views-table")]/tbody/tr')
+
+ratings = []
+for row in rows:
+ rating = dict()
+ ...
+```
+
+==
+
+# Data Collection
+Bias Scraper (easy)
+
+![allsides request](https://studentweb.cs.wwu.edu/~jensen33/static/577/allsides_request.png)
+
+==
+
+# Data Collection
+Embeddings (easy)
+
+```python
+# table = ...
+tokenizer = AutoTokenizer.from_pretrained("roberta-base")
+model = AutoModel.from_pretrained("roberta-base")
+
+for chunk in table:
+ tokens = tokenizer(chunk, add_special_tokens = True, truncation = True, padding = "max_length", max_length=92, return_attention_mask = True, return_tensors = "pt")
+ outputs = model(**tokens)
+ embeddings = outputs.last_hidden_state.detach().numpy()
+ ...
+```
+
+==
+
+# Data Collection
+Classification Embeddings (medium)
+
+```python
+...
+outputs = model(**tokens)[0].detach().numpy()
+scores = 1 / (1 + np.exp(-outputs)) # Sigmoid
+class_ids = np.argmax(scores, axis=1)
+for i, class_id in enumerate(class_ids):
+ results.append({"story_id": ids[i], "label" : model.config.id2label[class_id]})
+...
+```
+
+===
+
+# Data Selection
+
+==
+
+# Data Selection
+Stories
+
+- Clip the first and last full year of stories.
+- Remove duplicate stories (big stories span multiple days).
+
+==
+# Data Selection
+
+Publishers
+
+- Combine subdomains of stories.
+ - blog.washingtonpost.com and washingtonpost.com are considered the same publisher.
+ - This could be bad. For example: opinion.wsj.com != wsj.com.
+
+==
+
+# Data Selection
+
+Links
+
+- Select only stories from publishers that have been a 'parent' story ('original publishers').
+  - Eliminates small blogs and non-original news.
+- Eliminate publishers without links to original publishers.
+  - Eliminates siloed publications.
+  - Link matrix is square and low-ish dimensional.
+
+==
+
+# Data Selection
+
+Bias
+
+- Keep all ratings, even ones with low agree/disagree ratio.
+- Join datasets on publisher name.
+ - Not automatic (look up Named Entity Recognition).
+  - Started with Jaro-Winkler similarity, then matched manually from there.
+- Use numeric values
+ - [left: -2, left-center: -1, ...]
+
+===
+
+# Descriptive Stats
+
+Raw
+
+| metric | value |
+|:------------------|--------:|
+| total stories | 299714 |
+| total related | 960111 |
+| publishers | 7031 |
+| authors | 34346 |
+| max year | 2023 |
+| min year | 2005 |
+| top level domains | 7063 |
+
+==
+# Descriptive Stats
+
+Stories Per Publisher
+
+![stories per publisher](https://studentweb.cs.wwu.edu/~jensen33/static/577/stories_per_publisher.png)
+
+==
+
+# Descriptive Stats
+
+Top Publishers
+
+![top publishers](https://studentweb.cs.wwu.edu/~jensen33/static/577/top_publishers.png)
+
+==
+
+# Descriptive Stats
+
+Articles Per Year
+
+![articles per year](https://studentweb.cs.wwu.edu/~jensen33/static/577/articles_per_year.png)
+
+==
+
+# Descriptive Stats
+
+Common TLDs
+
+![common tlds](https://studentweb.cs.wwu.edu/~jensen33/static/577/common_tld.png)
+
+==
+
+# Descriptive Stats
+
+Post Process
+
+| metric             | value |
+|:------------------|--------:|
+| total stories | 251553 |
+| total related | 815183 |
+| publishers | 223 |
+| authors | 23809 |
+| max year | 2022 |
+| min year | 2006 |
+| top level domains | 234 |
+
+===
+# Experiments
+
+1. **clustering** on link similarity.
+2. **classification** on link similarity.
+3. **classification** on sentence embedding.
+4. **classification** on sentiment analysis.
+5. **regression** on emotional classification over time and publication.
+
+===
+# Experiment 1
+
+Setup
+
+- Create one-hot encoding of links between publishers.
+- Cluster the encoding.
+- Expect similar publications in same cluster.
+- Use PCA to visualize clusters.
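+
+A minimal sketch of that pipeline, assuming the `link_edges` table built elsewhere in this repo:
+
+```python
+from data.main import connect
+from sklearn.cluster import KMeans
+from sklearn.decomposition import PCA
+
+# pivot edges into a square publisher x publisher matrix
+DB = connect()
+edges = DB.query("select parent_id, child_id, onehot from link_edges").df()
+DB.close()
+pivot = edges.pivot(index='parent_id', columns='child_id', values='onehot').fillna(0)
+
+labels = KMeans(n_clusters=8, n_init="auto").fit_predict(pivot)  # cluster publishers
+coords = PCA(n_components=2).fit_transform(pivot)                # 2D projection to plot
+```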
+
+Note:
+Principal Component Analysis:
+- a statistical technique for reducing the dimensionality of a dataset.
+- a linear transformation into a new coordinate system where most of the variation in the data can be described with fewer dimensions than the initial data.
+
+==
+
+# Experiment 1
+
+One Hot Encoding
+
+| publisher | nytimes| wsj| newsweek| ...|
+|:----------|--------:|----:|--------:|----:|
+| nytimes | 1| 1| 1| ...|
+| wsj | 1| 1| 0| ...|
+| newsweek | 0| 0| 1| ...|
+| ... | ...| ...| ...| ...|
+
+==
+
+# Experiment 1
+
+n-Hot Encoding
+
+| publisher | nytimes| wsj| newsweek| ...|
+|:----------|--------:|----:|--------:|----:|
+| nytimes | 11| 1| 141| ...|
+| wsj | 1| 31| 0| ...|
+| newsweek | 0| 0| 1| ...|
+| ... | ...| ...| ...| ...|
+
+==
+
+# Experiment 1
+
+Normalized n-Hot Encoding
+
+| publisher | nytimes| wsj| newsweek| ...|
+|:----------|--------:|----:|--------:|----:|
+| nytimes | 0| 0.4| 0.2| ...|
+| wsj | 0.2| 0| 0.4| ...|
+| newsweek | 0.0| 0.0| 0.0| ...|
+| ... | ...| ...| ...| ...|
+
+==
+
+# Experiment 1
+
+Elbow criterion
+
+![elbow](https://studentweb.cs.wwu.edu/~jensen33/static/577/link_cluster_elbow.png)
+
+Note:
+
+The elbow method looks at the percentage of explained variance as a function of the number of clusters:
+
+One should choose a number of clusters so that adding another cluster doesn't give much better modeling of the data.
+
+Percentage of variance explained is the ratio of the between-group variance to the total variance.
+
+==
+
+# Experiment 1
+
+Link Magnitude
+
+![link magnitude cluster](https://studentweb.cs.wwu.edu/~jensen33/static/577/link_pca_clusters_links.png)
+
+==
+
+# Experiment 1
+
+Normalized
+
+![link normalized cluster](https://studentweb.cs.wwu.edu/~jensen33/static/577/link_pca_clusters_normalized.png)
+
+==
+
+# Experiment 1
+
+Onehot
+
+![link onehot cluster](https://studentweb.cs.wwu.edu/~jensen33/static/577/link_pca_clusters_onehot.png)
+
+==
+
+# Experiment 1
+
+Discussion
+
+- Best encoding: one-hot.
+  - Otherwise clusters form based on total link counts.
+- Clusters form, but offer no explanation.
+- Limitation: need the link encoding to cluster.
+  - Smaller publishers might not link very much.
+
+===
+
+# Experiment 2
+
+Setup
+
+- Create features:
+ - Publisher frequency.
+ - Reuse link encodings.
+- Create classes:
+ - Join bias classifications.
+- Train classifier.
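+
+A hedged sketch of that setup, using a simple scikit-learn baseline rather than the repo's torch classifier:
+
+```python
+from data.main import connect
+from sklearn.linear_model import LogisticRegression
+
+# join the 2D link-PCA features to the AllSides bias labels
+DB = connect()
+df = DB.query("""
+    SELECT c.first, c.second, b.bias as label
+    FROM top.publisher_pca_normalized c
+    JOIN top.publisher_bias pb ON pb.publisher_id = c.publisher_id
+    JOIN bias_ratings b ON b.id = pb.bias_id
+""").df()
+DB.close()
+
+clf = LogisticRegression(max_iter=1000)
+clf.fit(df[['first', 'second']], df['label'])
+```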
+
+Note:
+
+==
+# Experiment 2
+Descriptive stats
+
+| metric | value |
+|:------------|:----------|
+| publishers | 1582 |
+| labels | 6 |
+| left | 482 |
+| center | 711 |
+| right | 369 |
+| agree range | [0.0-1.0] |
+
+==
+
+# Experiment 2
+
+PCA + Labels
+
+![pca vs. bias labels](https://studentweb.cs.wwu.edu/~jensen33/static/577/pca_with_classes.png)
+
+==
+
+# Experiment 2
+
+Discussion
+
+- Link encodings (and their PCA) are useful.
+ - Labels are (sort of) separated and clustered.
+ - Creating them for smaller publishers is trivial.
+
+==
+
+# Experiment 2
+
+Limitations
+
+- Dependent on accurate rating.
+- Ordinal ratings not available.
+- Dependent on accurate joining across datasets.
+- Entire publication is rated, not authors.
+- Don't know what to do with community rating.
+
+===
+
+# Experiment 3
+
+Setup
+
+==
+
+# Limitations
+
+- Many different authors under the same publisher.
+- Publishers use syndication.
+- Bias ratings are biased.
+
+===
+
+# Questions
+
+===
+
+
+
+# References
+
+[1]: Stewart, A.J. et al. 2020. Polarization under rising inequality and economic decline. Science Advances. 6, 50 (Dec. 2020), eabd4201. DOI:https://doi.org/10.1126/sciadv.abd4201.
+
+Note:
diff --git a/src/bias.py b/src/bias.py
index 5347aa7..92f952a 100644
--- a/src/bias.py
+++ b/src/bias.py
@@ -1,12 +1,12 @@
import click
-from data import connect
+from data.main import connect
import pandas as pd
from lxml import etree
from pathlib import Path
import os
import csv
-def map(rating:str) -> int:
+def label_to_int(rating:str) -> int:
mapping = {
'left' : 0,
@@ -19,20 +19,18 @@ def map(rating:str) -> int:
return mapping[rating]
+def int_to_label(class_id: int) -> str:
+ mapping = {
+ 0 : 'left',
+ 1 : 'left-center',
+ 2 : 'center',
+ 3 : 'right-center',
+ 4 : 'right',
+ -1 : 'allsides',
+ }
+ return mapping[class_id]
-@click.command(name="bias:load")
-def load() -> None:
- DB = connect()
- DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
- f = str(DATA_DIR / "bias_ratings.csv")
- DB.sql(f"""
- create table bias_ratings as
- select
- row_number() over(order by b.publisher) as id
- ,b.*
- from read_csv_auto('{f}') b
- """)
@click.command(name="bias:normalize")
def normalize() -> None:
DB = connect()
@@ -41,133 +39,48 @@ def normalize() -> None:
CREATE OR REPLACE TABLE publisher_bias AS
WITH cte AS (
SELECT
- p.id
+ p.id as publisher_id
+ ,b.id as bias_id
,b.bias as label
,JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) as similarity
FROM bias_ratings b
- JOIN publishers p
+ JOIN top.publishers p
ON JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) > 0.95
),ranked AS (
SELECT
- id
+ publisher_id
+ ,bias_id
,label
,similarity
- ,ROW_NUMBER() OVER(PARTITION BY id ORDER BY similarity DESC) AS rn
+ ,ROW_NUMBER() OVER(PARTITION BY publisher_id ORDER BY similarity DESC) AS rn
FROM cte
)
SELECT
- id
+ publisher_id
,label
+ ,bias_id
FROM ranked
WHERE ranked.rn = 1
""")
+ mapping = [
+ {'label' :'left' , 'ordinal': -2},
+ {'label' :'left-center' , 'ordinal': -1},
+ {'label' :'center' , 'ordinal': 0},
+ {'label' :'right-center' , 'ordinal': 1},
+ {'label' :'right' , 'ordinal': 2},
+ ]
+ mapping = pd.DataFrame(mapping)
- DB.sql("""
- with cte as (
- select
- s.publisher_id
- ,count(1) as stories
- from stories s
- group by s.publisher_id
- )
- select
- s.publisher
- ,s.stories
- ,b.publisher
- ,b.bias
- from bias_ratings b
- join cte s
- on s.publisher = b.publisher
- order by
- stories desc
- limit 15
+ DB.query("alter table bias_ratings add column ordinal int")
+
+ DB.query("""
+ update bias_ratings b
+ set ordinal = o.ordinal
+ FROM mapping o
+ WHERE o.label = b.bias
""")
- DB.sql("""
- with cte as (
- select
- s.publisher
- ,count(1) as stories
- from stories s
- group by s.publisher
- )
- select
- sum(stories)
- ,avg(agree / disagree)
- from bias_ratings b
- join cte s
- on s.publisher = b.publisher
- """)
-
- DB.sql("""
- with cte as (
- select
- s.publisher
- ,count(1) as stories
- from stories s
- group by s.publisher
- )
- select
- sum(s.stories) filter(where b.publisher is not null) as matched
- ,sum(s.stories) filter(where b.publisher is null) as unmatched
- ,cast(sum(s.stories) filter(where b.publisher is not null) as numeric)
- / sum(s.stories) filter(where b.publisher is null) as precent_matched
- from bias_ratings b
- right join cte s
- on s.publisher = b.publisher
- """)
-
- DB.sql("""
- select
- *
- from bias_ratings
- where publisher ilike '%CNN%'
- """)
-
-@click.command(name='bias:debug')
-def debug() -> None:
- DB = connect()
- DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
- f = str(DATA_DIR / "bias_ratings.csv")
-
- DB.sql("""
- with cte as (
- select
- outlet
- ,count(1) as stories
- from stories
- group by outlet
- )
- ,total as (
- select
- sum(stories) as total
- from cte
- )
- select
- cte.outlet
- ,cte.stories
- ,bias.outlet
- ,bias.lean
- ,sum(100 * (cte.stories / cast(total.total as float))) over() as rep
- ,total.total
- from cte
- join bias
- on jaro_winkler_similarity(bias.outlet, cte.outlet) > 0.9
- cross join total.total
- """)
-
- DB.sql("""
- select
- outlet
- ,count(1) as stories
- from stories
- group by outlet
- order by count(1) desc
- limit 50
- """)
-
- outlets
@click.command(name='bias:parse')
def parse() -> None:
@@ -199,3 +112,64 @@ def parse() -> None:
ratings.append(rating)
df = pd.DataFrame(ratings)
df.to_csv(DATA_DIR / 'bias_ratings.csv', sep="|", index=False, quoting=csv.QUOTE_NONNUMERIC)
+
+@click.command(name="bias:load")
+def load() -> None:
+ DB = connect()
+ DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
+ f = str(DATA_DIR / "bias_ratings.csv")
+
+ DB.sql(f"""
+ CREATE TABLE bias_ratings as
+ select
+ row_number() over(order by b.publisher) as id
+ ,b.*
+ from read_csv_auto('{f}') b
+ """)
+
+@click.command('bias:export')
+def export():
+ data_path = Path(os.environ['DATA_MINING_DATA_DIR'])
+
+ DB = connect()
+ all_bias = DB.query("""
+ SELECT
+ id as bias_id
+ ,publisher as name
+ ,bias as label
+ FROM bias_ratings
+ ORDER by agree desc
+ """)
+ all_bias.df().to_csv(data_path / 'TMP_publisher_bias.csv', sep="|", index=False)
+ mapped_bias = DB.query("""
+ SELECT
+ p.id as publisher_id
+ ,p.name as name
+ ,p.tld as tld
+ ,b.label as bias
+ ,b.bias_id as bias_id
+ FROM top.publishers p
+ LEFT JOIN publisher_bias b
+ ON b.publisher_id = p.id
+ """)
+ mapped_bias.df().to_csv(data_path / 'TMP_publisher_bias_to_load.csv', sep="|", index=False)
+ DB.close()
+
+@click.command('bias:import-mapped')
+def import_mapped():
+ data_path = Path(os.environ['DATA_MINING_DATA_DIR'])
+ table_name = "top.publisher_bias"
+
+ DB = connect()
+ df = pd.read_csv(data_path / 'TMP_publisher_bias_to_load.csv', sep="|")
+
+ DB.query(f"""
+ CREATE OR REPLACE TABLE {table_name} AS
+ SELECT
+ publisher_id AS publisher_id
+ ,cast(bias_id AS int) as bias_id
+ FROM df
+ WHERE bias_id IS NOT NULL
+ """)
+ print(f"created table: {table_name}")
+
diff --git a/src/cli.py b/src/cli.py
index d91e0e2..11744d3 100644
--- a/src/cli.py
+++ b/src/cli.py
@@ -7,7 +7,7 @@ def cli():
if __name__ == "__main__":
load_dotenv()
- import scrape
+ from data import scrape
cli.add_command(scrape.download)
cli.add_command(scrape.parse)
cli.add_command(scrape.load)
@@ -32,4 +32,26 @@ if __name__ == "__main__":
cli.add_command(emotion.create_table)
import sentence
cli.add_command(sentence.embed)
+ from train import main as train_main
+ cli.add_command(train_main.main)
+
+ import plots.descriptive as plotd
+ cli.add_command(plotd.articles_per_year)
+ cli.add_command(plotd.distinct_publishers)
+ cli.add_command(plotd.stories_per_publisher)
+ cli.add_command(plotd.top_publishers)
+ cli.add_command(plotd.common_tld)
+
+ import links as linkcli
+ cli.add_command(linkcli.create_table)
+ cli.add_command(linkcli.create_pca)
+ cli.add_command(linkcli.create_clusters)
+
+ import plots.links as plotl
+ cli.add_command(plotl.elbow)
+ cli.add_command(plotl.link_pca_clusters)
+
+ import plots.classifier as plotc
+ cli.add_command(plotc.pca_with_classes)
+
cli()
diff --git a/src/data/__init__.py b/src/data/__init__.py
new file mode 100644
index 0000000..0c64a7f
--- /dev/null
+++ b/src/data/__init__.py
@@ -0,0 +1,6 @@
+import data.main
+import data.scrape
+__all__ = [
+ 'main'
+ ,'scrape'
+]
diff --git a/src/data.py b/src/data/main.py
similarity index 100%
rename from src/data.py
rename to src/data/main.py
diff --git a/src/scrape.py b/src/data/scrape.py
similarity index 73%
rename from src/scrape.py
rename to src/data/scrape.py
index f3a285c..13377ba 100644
--- a/src/scrape.py
+++ b/src/data/scrape.py
@@ -4,10 +4,12 @@ import requests
from pathlib import Path
import click
from tqdm import tqdm
-from data import data_dir, connect
+from data.main import data_dir, connect
from lxml import etree
import pandas as pd
from urllib.parse import urlparse
+from tld import get_tld
+from tld.utils import update_tld_names
@click.command(name='scrape:load')
@click.option('--directory', type=Path, default=data_dir(), show_default=True)
@@ -61,6 +63,7 @@ def download(output_dir):
@click.option('-o', '--output_dir', type=Path, default=data_dir(), show_default=True)
def parse(directory, output_dir):
"""parse the html files on disk into a structured csv format."""
+ update_tld_names()
directory = data_dir() / "memeorandum"
parser = etree.HTMLParser()
pages = [f for f in directory.glob("*.html")]
@@ -104,8 +107,7 @@ def parse(directory, output_dir):
url = item.xpath('.//strong/a')[0].get('href')
out['url'] = url
- out['publisher_url_domain'] = urlparse(publisher_url).netloc
- out['domain'] = urlparse(url).netloc
+ out['tld'] = get_tld(publisher_url)
item_id = hash((page.stem, url))
out['id'] = item_id
@@ -225,3 +227,111 @@ def normalize():
alter table related_stories drop publisher_domain;
""")
+
+def another_norm():
+    def map_tld(x):
+        # reduce a url to its first-level domain; None when parsing fails
+        try:
+            return get_tld(x, as_object=True).fld
+        except Exception:
+            return None
+
+    DB = connect()
+    sv2 = pd.read_csv(data_dir() / 'stories.csv', sep="|")
+    related = pd.read_csv(data_dir() / 'related.csv', sep="|")
+
+ related['tld'] = related.url.apply(lambda x: map_tld(x))
+
+ DB.query("""
+ update related_stories
+ set publisher_id = p.id
+ from publishers p
+ join related r
+ on r.tld = p.tld
+ where r.url = related_stories.url
+ """)
+
+
+ DB.query("""alter table stories add column tld text""")
+
+ s_url = DB.query("""
+ select
+ id
+ ,url
+ from stories
+ """).df()
+
+
+ s_url['tld'] = s_url.url.apply(lambda x: map_tld(x))
+
+ DB.query("""
+ update stories
+ set tld = s_url.tld
+ from s_url
+ where s_url.id = stories.id
+ """)
+
+ DB.query("""
+ update stories
+ set publisher_id = p.id
+ from publishers p
+ where p.tld = stories.tld
+ """)
+
+ sv2['tld'] = sv2.publisher_url.apply(lambda x: map_tld(x))
+
+
+ new_pub = DB.query("""
+ with cte as (
+ select
+ tld
+ ,publisher
+ ,count(1) filter(where year(published_at) = 2022) as recent_ctn
+ ,count(1) as ctn
+ from sv2
+ group by
+ tld
+ ,publisher
+ )
+ ,r as (
+ select
+ tld
+ ,publisher
+ ,ctn
+ ,row_number() over(partition by tld order by recent_ctn desc) as rn
+ from cte
+ )
+ select
+ row_number() over() as id
+ ,publisher as name
+ ,tld
+ from r
+ where rn = 1
+ order by ctn desc
+ """).df()
+
+ DB.query("""
+ CREATE OR REPLACE TABLE publishers AS
+ SELECT
+ id
+ ,name
+ ,tld
+ FROM new_pub
+ """)
+
+ DB.sql("""
+ SELECT
+ s.id
+ ,sv2.publisher_url
+ FROM stories s
+ JOIN sv2
+ on sv2.id = s.id
+ limit 5
+ """)
diff --git a/src/emotion.py b/src/emotion.py
index 7ef23e6..b7e4824 100644
--- a/src/emotion.py
+++ b/src/emotion.py
@@ -6,7 +6,7 @@ import numpy as np
from transformers import BertTokenizer
from model import BertForMultiLabelClassification
-from data import connect, data_dir
+from data.main import connect, data_dir
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
@@ -376,3 +376,99 @@ def debug():
DB.close()
out.to_csv(data_dir() / 'emotions.csv', sep="|")
+
+def another():
+ DB = connect()
+ DB.sql("""
+ select
+ *
+ from emotions
+ """)
+
+ emotions = DB.sql("""
+ select
+ year(s.published_at) as year
+ ,se.label as emotion
+ ,count(1) as stories
+ from stories s
+ join story_emotions se
+ on s.id = se.story_id
+ group by
+ year(s.published_at)
+ ,se.label
+ """).df()
+
+ sns.scatterplot(x=emotions['year'], y=emotions['stories'], hue=emotions['emotion'])
+ plt.show()
+
+ pivot = emotions.pivot(index='year', columns='emotion', values='stories')
+ pivot.reset_index(inplace=True)
+ from sklearn.linear_model import LinearRegression
+ reg = LinearRegression()
+
+ for emotion in pivot.keys()[1:].tolist():
+ _ = reg.fit(pivot['year'].to_numpy().reshape(-1, 1), pivot[emotion])
+ print(f"{emotion}: {reg.coef_[0]}")
+
+ fig, ax = plt.subplots()
+ #sns.lineplot(x=pivot['anger'], y=pivot['joy'])
+ #sns.lineplot(x=pivot['anger'], y=pivot['surprise'], ax=ax)
+ sns.lineplot(x=pivot['anger'], y=pivot['fear'], ax=ax)
+ plt.show()
+
+ normalized = DB.sql("""
+ with cte as (
+ select
+ year(s.published_at) as year
+ ,se.label as emotion
+ ,b.label as bias
+ from stories s
+ join story_emotions se
+ on s.id = se.story_id
+ join publisher_bias b
+    on b.publisher_id = s.publisher_id
+ where b.label != 'allsides'
+ and se.label != 'neutral'
+ )
+ select
+ distinct
+ year
+ ,emotion
+ ,bias
+ ,cast(count(1) over(partition by year, bias, emotion) as float) / count(1) over(partition by year, bias) as group_count
+ from cte
+ """).df()
+
+ DB.sql("""
+ select
+ b.label as bias
+ ,count(1) as stories
+ from stories s
+ join story_emotions se
+ on s.id = se.story_id
+ join publisher_bias b
+    on b.publisher_id = s.publisher_id
+ group by
+ b.label
+ """).df()
+
+    # pivot the normalized bias/emotion shares into wide format
+    another_pivot = normalized.pivot(index=['bias', 'year'], columns='emotion', values='group_count')
+    another_pivot.reset_index(inplace=True)
+
+ sns.lineplot(data=normalized, x='year', y='group_count', hue='bias', style='emotion')
+ plt.show()
+
+ sns.relplot(
+ data=normalized, x="year", y="group_count", hue="emotion", col='bias', kind="line"
+ #data=normalized, x="year", y="group_count", hue="emotion", col='bias', kind="line", facet_kws=dict(sharey=False)
+ )
+ plt.show()
+
+    DB.sql("""
+    select
+    *
+    from another_pivot
+    """)
+
+    DB.close()
diff --git a/src/lib.py b/src/lib.py
deleted file mode 100644
index deef1b2..0000000
--- a/src/lib.py
+++ /dev/null
@@ -1,8 +0,0 @@
-import sklearn
-import polars as pl
-import toml
-from pathlib import Path
-
-config = toml.load('/home/user/577/repo/config.toml')
-app_dir = Path(config.get('app').get('path'))
-df = pl.read_csv(app_dir / "data/articles.csv")
diff --git a/src/links.py b/src/links.py
index 93e7cfa..9dc2a56 100644
--- a/src/links.py
+++ b/src/links.py
@@ -1,12 +1,148 @@
-from data import connect
+import click
+from data.main import connect
import pandas as pd
import numpy as np
-from sklearn.decomposition import PCA, TruncatedSVD
-from sklearn.cluster import MiniBatchKMeans
import seaborn as sns
import matplotlib.pyplot as plt
+@click.command('links:create-table')
+def create_table():
+
+ table_name = "top.link_edges"
+ DB = connect()
+ DB.query(f"""
+ CREATE OR REPLACE TABLE {table_name} AS
+ with cte as(
+ SELECT
+ s.publisher_id as parent_id
+ ,r.publisher_id as child_id
+ ,count(1) as links
+ FROM top.stories s
+ JOIN top.related_stories r
+ ON s.id = r.parent_id
+ group by
+ s.publisher_id
+ ,r.publisher_id
+ )
+ SELECT
+ cte.parent_id
+ ,cte.child_id
+ ,cte.links as links
+ ,cast(cte.links as float) / sum(cte.links) over(partition by cte.parent_id) as normalized
+ ,case when cte.links > 0 then 1 else 0 end as onehot
+ FROM cte
+ WHERE cte.child_id in (
+ SELECT
+ distinct parent_id
+ FROM cte
+ )
+ AND cte.parent_id in (
+ SELECT
+ distinct child_id
+ FROM cte
+ )
+ """)
+ DB.close()
+ print(f"created {table_name}")
+
+@click.command('links:create-pca')
+@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
+def create_pca(source):
+ """create 2D pca labels"""
+
+ from sklearn.decomposition import PCA
+
+ table_name = f"top.publisher_pca_{source}"
+ DB = connect()
+ pub = DB.query("""
+ SELECT
+ *
+ FROM top.publishers
+ """).df()
+ df = DB.query(f"""
+ SELECT
+ parent_id
+ ,child_id
+ ,{source} as links
+ FROM top.link_edges
+ """).df()
+ DB.close()
+ pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
+
+ svd = PCA(n_components=2)
+ svd_out = svd.fit_transform(pivot)
+
+ out = pivot.reset_index()[['parent_id']]
+ out['first'] = svd_out[:, 0]
+ out['second'] = svd_out[:, 1]
+ out = pd.merge(out, pub, left_on='parent_id', right_on='id')
+
+ DB = connect()
+ DB.query(f"""
+ CREATE OR REPLACE TABLE {table_name} AS
+ SELECT
+ out.id as publisher_id
+ ,out.first as first
+ ,out.second as second
+ FROM out
+ """)
+ DB.close()
+ print(f"created {table_name}")
+
+
+@click.command('links:create-clusters')
+@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
+def create_clusters(source):
+ from sklearn.cluster import KMeans
+
+ table_name = f"top.publisher_clusters_{source}"
+ DB = connect()
+ df = DB.query(f"""
+ SELECT
+ parent_id
+ ,child_id
+ ,{source} as links
+ FROM top.link_edges
+ """).df()
+ pub = DB.query("""
+ SELECT
+ *
+ FROM top.publishers
+ """).df()
+ DB.close()
+ pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
+
+
+ k = 8
+ kmeans = KMeans(n_clusters=k, n_init="auto")
+ pred = kmeans.fit_predict(pivot)
+ out = pivot.reset_index()[['parent_id']]
+ out['label'] = pred
+ out = pd.merge(out, pub, left_on='parent_id', right_on='id')
+ new_table = out[['id', 'label']]
+
+ DB = connect()
+ DB.query(f"""
+ CREATE OR REPLACE TABLE {table_name} AS
+ SELECT
+ n.id as publisher_id
+ ,n.label as label
+ FROM new_table n
+ """)
+ DB.close()
+ print(f"created {table_name}")
def to_matrix():
"""returns an adjacency matrix of publishers to publisher link frequency"""
@@ -21,6 +157,7 @@ def to_matrix():
{'label' :'right', 'value' : 4},
{'label' :'allsides', 'value' : -1},
])
+
bias = DB.sql("""
SELECT
b.id
@@ -37,11 +174,7 @@ def to_matrix():
p.id
,p.name
,p.url
- ,b.label
- ,b.value
from publishers p
- left join bias b
- on b.id = p.id
""").df()
edges = DB.sql("""
@@ -81,12 +214,23 @@ def to_matrix():
ON p.id = cte.parent_id
""").df()
adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
out = pd.DataFrame(adj.index.values, columns=['id'])
out = pd.merge(out, pub, how='left', on='id')
+ return out
+@click.command('links:analysis')
+def analysis():
+ from sklearn.decomposition import PCA, TruncatedSVD
+ from sklearn.cluster import MiniBatchKMeans
+ adj = to_matrix()
pca = PCA(n_components=4)
pca_out = pca.fit_transform(adj)
diff --git a/src/mine.py b/src/mine.py
index d7f1d29..8c2108b 100644
--- a/src/mine.py
+++ b/src/mine.py
@@ -1,4 +1,4 @@
-from data import data_dir, connect
+from data.main import data_dir, connect
import numpy as np
import sklearn
from sklearn.cluster import MiniBatchKMeans
diff --git a/src/plots/__init__.py b/src/plots/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/plots/classifier.py b/src/plots/classifier.py
new file mode 100644
index 0000000..c85aa7d
--- /dev/null
+++ b/src/plots/classifier.py
@@ -0,0 +1,34 @@
+import click
+from data.main import connect
+import os
+import seaborn as sns
+import matplotlib.pyplot as plt
+from pathlib import Path
+
+out_dir = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
+
+@click.command('plot:pca-with-classes')
+def pca_with_classes():
+ filename = "pca_with_classes.png"
+
+ DB = connect()
+ data = DB.query(f"""
+ SELECT
+ p.tld
+ ,b.bias
+ ,c.first
+ ,c.second
+ ,round(cast(b.agree as float) / (b.agree + b.disagree), 2) ratio
+ FROM top.publishers p
+ JOIN top.publisher_bias pb
+ ON p.id = pb.publisher_id
+ JOIN bias_ratings b
+ ON b.id = pb.bias_id
+ JOIN top.publisher_pca_normalized c
+ ON c.publisher_id = p.id
+ """).df()
+ DB.close()
+ ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['bias'], s=100)
+ ax.set(title="pca components vs. bias labels", xlabel="first pca component", ylabel="second pca component")
+ plt.savefig(out_dir / filename)
+ print(f"saved: {filename}")
diff --git a/src/plots/descriptive.py b/src/plots/descriptive.py
new file mode 100644
index 0000000..24cf25b
--- /dev/null
+++ b/src/plots/descriptive.py
@@ -0,0 +1,302 @@
+import click
+from data.main import connect
+import os
+import seaborn as sns
+import matplotlib.pyplot as plt
+from pathlib import Path
+import numpy as np
+
+out_dir = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
+
+@click.command('plot:articles-per-year')
+def articles_per_year():
+ filename = 'articles_per_year.png'
+
+ DB = connect()
+ data = DB.query("""
+ select
+ year(published_at) as year
+ ,count(1) as stories
+ from stories
+ group by
+ year(published_at)
+ """).df()
+ DB.close()
+
+ ax = sns.barplot(x=data.year, y=data.stories, color='tab:blue')
+ ax.tick_params(axis='x', rotation=90)
+ ax.set(title="count of articles per year", ylabel="count of stories (#)")
+ plt.tight_layout()
+ plt.savefig(out_dir / filename)
+
+@click.command('plot:distinct-publishers')
+def distinct_publishers():
+ filename = 'distinct_publishers.png'
+
+ DB = connect()
+ data = DB.query("""
+ select
+ year(published_at) as year
+ ,count(distinct publisher_id) as publishers
+ from stories
+ group by
+ year(published_at)
+ """).df()
+ DB.close()
+
+ ax = sns.barplot(x=data.year, y=data.publishers, color='tab:blue')
+ ax.tick_params(axis='x', rotation=90)
+ ax.set(title="count of publishers per year", ylabel="count of publishers (#)")
+ plt.tight_layout()
+ plt.savefig(out_dir / filename)
+ plt.close()
+
+@click.command('plot:stories-per-publisher')
+def stories_per_publisher():
+ filename = 'stories_per_publisher.png'
+
+ DB = connect()
+ data = DB.query("""
+ with cte as (
+ select
+ publisher_id
+ ,year(published_at) as year
+ ,count(1) as stories
+ from stories
+ group by
+ publisher_id
+ ,year(published_at)
+ ) , agg as (
+ select
+ publisher_id
+ ,avg(stories) as stories_per_year
+ ,case
+ when avg(stories) < 2 then 2
+ when avg(stories) < 4 then 4
+ when avg(stories) < 8 then 8
+ when avg(stories) < 16 then 16
+ when avg(stories) < 32 then 32
+ when avg(stories) < 64 then 64
+ when avg(stories) < 128 then 128
+ else 129
+ end as max_avg
+ from cte
+ group by
+ publisher_id
+ )
+ select
+ max_avg
+ ,count(1) as publishers
+ from agg
+ group by
+ max_avg
+ """).df()
+ DB.close()
+
+ ax = sns.barplot(x=data.max_avg, y=data.publishers, color='tab:blue')
+ ax.set(title="histogram of publisher stories per year", ylabel="count of publishers (#)", xlabel="max average stories / year")
+ plt.tight_layout()
+ plt.savefig(out_dir / filename)
+ plt.close()
+
+
+@click.command('plot:top-publishers')
+def top_publishers():
+ """plot top publishers over time"""
+
+ filename = 'top_publishers.png'
+
+ DB = connect()
+ data = DB.query("""
+ select
+ p.tld
+ ,year(published_at) as year
+ ,count(1) as stories
+ from (
+ select
+ p.tld
+ ,p.id
+ from top.publishers p
+ join top.stories s
+ on s.publisher_id = p.id
+ group by
+ p.tld
+ ,p.id
+ order by count(1) desc
+ limit 20
+ ) p
+ join top.stories s
+ on s.publisher_id = p.id
+ group by
+ p.tld
+ ,year(published_at)
+ order by count(distinct s.id) desc
+ """).df()
+ DB.close()
+
+ pivot = data.pivot(columns='year', index='tld', values='stories')
+ ax = sns.heatmap(pivot, cmap="crest")
+ ax.set(title="top 20 publishers (by tld)", ylabel="tld", xlabel="stories / year (#)")
+ plt.tight_layout()
+ plt.savefig(out_dir / filename)
+ plt.close()
+
+
+@click.command('plot:common_tld')
+def common_tld():
+ import dataframe_image as dfi
+ filename = 'common_tld.png'
+
+ DB = connect()
+ data = DB.query("""
+ select
+ split_part(url, '.', -1) as tld
+ ,count(1) as publishers
+ ,case when count(1) < 20
+ then string_agg(distinct url, '\t')
+ else NULL
+ end as urls
+ from publishers
+ group by
+ split_part(url, '.', -1)
+ order by
+ count(1) desc
+ """).df()
+ DB.close()
+ data[:15][['tld', 'publishers']].style.hide(axis="index").export_png(out_dir / filename, table_conversion='matplotlib')
+
+def stats():
+    DB = connect()
+
+    # raw
+    print(DB.query("""
+ SELECT
+ 'total stories' as key
+ ,COUNT(1) as value
+ FROM stories
+ UNION
+ SELECT
+ 'total related' as key
+ ,COUNT(1) as value
+ FROM related_stories
+ UNION
+ SELECT
+ 'top level domains' as key
+ ,COUNT(distinct tld) as value
+ FROM stories
+ UNION
+ SELECT
+ 'publishers' as key
+ ,COUNT(1) as value
+ FROM publishers
+ UNION
+ SELECT
+ 'authors' as key
+ ,COUNT(distinct author) as value
+ FROM stories
+ UNION
+ SELECT
+ 'min year' as key
+ ,min(year(published_at)) as value
+ FROM stories
+ UNION
+ SELECT
+ 'max year' as key
+ ,max(year(published_at)) as value
+ FROM stories
+    """).df().to_markdown(index=False))
+
+    # selected
+    print(DB.query("""
+ SELECT
+ 'total stories' as key
+ ,COUNT(1) as value
+ FROM top.stories
+ UNION
+ SELECT
+ 'total related' as key
+ ,COUNT(1) as value
+ FROM top.related_stories
+ UNION
+ SELECT
+ 'top level domains' as key
+ ,COUNT(distinct tld) as value
+ FROM top.stories
+ UNION
+ SELECT
+ 'publishers' as key
+ ,COUNT(1) as value
+ FROM top.publishers
+ UNION
+ SELECT
+ 'authors' as key
+ ,COUNT(distinct author) as value
+ FROM top.stories
+ UNION
+ SELECT
+ 'min year' as key
+ ,min(year(published_at)) as value
+ FROM top.stories
+ UNION
+ SELECT
+ 'max year' as key
+ ,max(year(published_at)) as value
+ FROM top.stories
+    """).df().to_markdown(index=False))
+    DB.close()
+
+@click.command('plot:bias-stats')
+def bias_stats():
+ import dataframe_image as dfi
+ filename = 'bias_stats.png'
+
+ DB = connect()
+
+ df = DB.query("""
+ SELECT
+ 'publishers' as metric
+ ,count(1) as value
+ FROM bias_ratings
+ UNION
+ SELECT
+ 'labels' as metric
+ ,count(distinct bias) as value
+ FROM bias_ratings
+ UNION
+ SELECT
+ 'right' as metric
+ ,count(1) as value
+ FROM bias_ratings
+ WHERE bias in ('right', 'right-center')
+ UNION
+ SELECT
+ 'left' as metric
+ ,count(1) as value
+ FROM bias_ratings
+ WHERE bias in ('left', 'left-center')
+ UNION
+ SELECT
+ 'center' as metric
+ ,count(1) as value
+ FROM bias_ratings
+ WHERE bias in ('center')
+ UNION
+ SELECT
+ 'agree_range' as metric
+ ,'['
+ || min(cast(agree as float) / (agree + disagree))
+ || '-'
+ || max(cast(agree as float) / (agree + disagree))
+ || ']'
+ as value
+ FROM bias_ratings
+ WHERE bias in ('center')
+ """).df()
+ DB.close()
+ print(df.to_markdown(index=False))
diff --git a/src/plots/links.py b/src/plots/links.py
new file mode 100644
index 0000000..6526142
--- /dev/null
+++ b/src/plots/links.py
@@ -0,0 +1,114 @@
+import click
+from data.main import connect
+from links import to_matrix
+import os
+import seaborn as sns
+import matplotlib.pyplot as plt
+from pathlib import Path
+import numpy as np
+from sklearn.metrics import silhouette_score
+import pandas as pd
+
+out_dir = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
+
+@click.command('plot:link-elbow')
+def elbow():
+ from sklearn.cluster import KMeans
+
+ filename = 'link_cluster_elbow.png'
+
+ DB = connect()
+ df = DB.query("""
+ SELECT
+ *
+ FROM link_edges
+ """).df()
+ pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
+
+ to_plot = []
+ for k in range(2, 15):
+ kmeans = KMeans(n_clusters=k, n_init="auto")
+ kmeans.fit(pivot)
+ label = kmeans.labels_
+ coeff = silhouette_score(pivot, label, metric='euclidean')
+ to_plot.append({'k': k, 'inertia' : kmeans.inertia_, 'coeff': coeff})
+ to_plot = pd.DataFrame(to_plot)
+
+ ax = sns.lineplot(x=to_plot.k, y=to_plot.inertia)
+ ax.set(title="elbow criterion plot of clusters", xlabel="bin size (k)", ylabel="sum of squared distances between centroids/points")
+ plt.savefig(out_dir / filename)
+ plt.close()
+
+ # randomly pick 8
+
+@click.command('plot:link-pca-clusters')
+@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
+def link_pca_clusters(source):
+
+ filename = f"link_pca_clusters_{source}.png"
+
+ DB = connect()
+ df = DB.query(f"""
+ SELECT
+ c.label as cluster
+ ,p.tld
+ --,b.label as bias
+ ,pca.first
+ ,pca.second
+ ,s.cnt as stories
+ FROM top.publisher_clusters_{source} c
+ JOIN top.publishers p
+ ON c.publisher_id = p.id
+ JOIN
+ (
+ select
+ s.publisher_id
+ ,count(1) as cnt
+ FROM top.stories s
+ GROUP BY
+ s.publisher_id
+ ) s
+ ON s.publisher_id = p.id
+ JOIN top.publisher_pca_{source} pca
+ ON pca.publisher_id = p.id
+ """).df()
+ DB.close()
+
+ ax = sns.scatterplot(x=df['first'], y=df['second'], hue=df['cluster'])
+ ax.set(title=f"pca components vs. clusters ({source})", xlabel="first pca component", ylabel="second pca component")
+ plt.savefig(out_dir / filename)
+
+ # .df().groupby(['cluster', 'bias']).describe()
+
+
+
+
+def test():
+    data_dir = Path(os.getenv('DATA_MINING_DATA_DIR'))
+
+    DB = connect()
+    DB.query("""
+ SELECT
+ p.id as publisher_id
+ ,p.name
+ ,p.tld
+ ,cast(b.bias_id as int) as bias_id
+ ,count(1) as stories
+ FROM publishers p
+ JOIN stories s
+ ON s.publisher_id = p.id
+ JOIN publisher_clusters c
+ ON c.publisher_id = p.id
+ LEFT JOIN publisher_bias b
+ ON b.publisher_id = p.id
+ where bias_id is null
+ group by
+ p.id
+ ,p.name
+ ,p.tld
+ ,b.bias_id
+ ORDER BY count(1) desc
+ """)
+
+ # .df().to_csv(data_dir / 'cluster_publishers.csv', sep="|", index=False)
+ DB.close()
+
diff --git a/src/selection.py b/src/selection.py
new file mode 100644
index 0000000..9c34543
--- /dev/null
+++ b/src/selection.py
@@ -0,0 +1,48 @@
+from data.main import connect
+import pandas as pd
+import numpy as np
+
+DB = connect()
+edges = DB.query("""
+ select
+ *
+ from link_edges
+""").df()
+DB.close()
+
+adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
+select_publishers = pd.DataFrame(adj.index.tolist(), columns=['publisher_id'])
+
+DB = connect()
+DB.query("create schema top")
+
+DB.query("""
+ CREATE OR REPLACE TABLE top.publishers AS
+ SELECT
+ p.*
+ FROM publishers p
+ JOIN select_publishers s
+ ON s.publisher_id = p.id
+""")
+
+DB.query("""
+ CREATE OR REPLACE TABLE top.stories AS
+ SELECT
+ s.*
+ FROM stories s
+ JOIN top.publishers p
+ ON s.publisher_id = p.id
+ WHERE year(s.published_at) >= 2006
+ AND year(s.published_at) < 2023
+""")
+
+DB.query("""
+ CREATE OR REPLACE TABLE top.related_stories AS
+ SELECT
+ r.*
+ FROM top.stories s
+ JOIN related_stories r
+ ON s.id = r.parent_id
+""")
diff --git a/src/sentence.py b/src/sentence.py
new file mode 100644
index 0000000..4bf5014
--- /dev/null
+++ b/src/sentence.py
@@ -0,0 +1,138 @@
+from transformers import AutoTokenizer, AutoModel
+import torch
+import torch.nn.functional as F
+from data.main import connect, data_dir
+import os
+from pathlib import Path
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+import click
+
+#Mean Pooling - Take attention mask into account for correct averaging
+def mean_pooling(model_output, attention_mask):
+ token_embeddings = model_output[0] #First element of model_output contains all token embeddings
+ input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+ return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+@click.option('-c', '--chunks', type=int, default=500, show_default=True)
+@click.command("sentence:embed")
+def embed(chunks):
+
+ # Load model from HuggingFace Hub
+ tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
+ model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
+
+ # load data
+ DB = connect()
+ table = DB.sql("""
+ select
+ id
+ ,title
+ from stories
+ order by id desc
+ """).df()
+ DB.close()
+
+ # normalize text
+ table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
+
+
+ chunked = np.array_split(table, chunks)
+ # generate embeddings from list of titles
+ iterator = tqdm(chunked, 'embedding')
+ embeddings = []
+ embedding_ids = []
+ for _, chunk in enumerate(iterator):
+ sentences = chunk['title'].tolist()
+ ids = chunk['id'].tolist()
+ # Tokenize sentences
+ encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
+ # Compute token embeddings
+ with torch.no_grad():
+ model_output = model(**encoded_input)
+ # Perform pooling
+ output = mean_pooling(model_output, encoded_input['attention_mask'])
+ # Normalize embeddings
+ output = F.normalize(output, p=2, dim=1)
+ embeddings.append(output)
+ embedding_ids.append(ids)
+
+ embeddings = np.concatenate(embeddings)
+ ids = np.concatenate(embedding_ids)
+
+ # save embeddings
+ save_to = data_dir() / 'embeddings.npy'
+ np.save(save_to, embeddings)
+ print(f"embeddings saved: {save_to}")
+
+ # save ids
+ save_to = data_dir() / 'embedding_ids.npy'
+ np.save(save_to, ids)
+ print(f"ids saved: {save_to}")
+
+
+@click.command('sentence:create-pca-table')
+def create_table():
+ from sklearn import linear_model
+ data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
+
+ embeddings = np.load(data_path / 'embeddings.npy')
+ embedding_ids = np.load(data_path / 'embedding_ids.npy')
+ ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()
+ DB = connect()
+
+ data = DB.query("""
+ SELECT
+ ids.index
+ ,s.id
+ ,b.ordinal
+ FROM ids
+ JOIN top.stories s
+ ON ids.story_id = s.id
+ JOIN top.publisher_bias pb
+ ON pb.publisher_id = s.publisher_id
+ JOIN bias_ratings b
+ ON b.id = pb.bias_id
+    """).df()
+    DB.close()
+
+ x = embeddings[data['index']]
+ y = data['ordinal'].to_numpy().reshape(-1, 1)
+
+ reg = linear_model.LinearRegression()
+
+ reg.fit(x, y)
+
+    print(f"fit linear probe; coef shape: {reg.coef_.shape}")
+
+@click.command('sentence:create-svm-table')
+def create_svm_table():
+ from sklearn import svm
+ data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
+
+ embeddings = np.load(data_path / 'embeddings.npy')
+ embedding_ids = np.load(data_path / 'embedding_ids.npy')
+ ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()
+
+ DB = connect()
+ data = DB.query("""
+ SELECT
+ ids.index
+ ,s.id
+ ,b.ordinal
+ FROM ids
+ JOIN top.stories s
+ ON ids.story_id = s.id
+ JOIN top.publisher_bias pb
+ ON pb.publisher_id = s.publisher_id
+ JOIN bias_ratings b
+ ON b.id = pb.bias_id
+    """).df()
+    DB.close()
+
+ x = embeddings[data['index']]
+ #y = data['ordinal'].to_numpy().reshape(-1, 1)
+ y = data['ordinal']
+
+    clf = svm.SVC()
+    clf.fit(x, y)
+    pred = clf.predict(x)  # in-sample predictions; no holdout split yet
+
diff --git a/src/train/__init__.py b/src/train/__init__.py
new file mode 100644
index 0000000..4519cec
--- /dev/null
+++ b/src/train/__init__.py
@@ -0,0 +1,5 @@
+import train.main
+
+__all__ = [
+ 'main'
+]
diff --git a/src/train/dataset.py b/src/train/dataset.py
new file mode 100644
index 0000000..15bfd47
--- /dev/null
+++ b/src/train/dataset.py
@@ -0,0 +1,38 @@
+from torch.utils.data import Dataset
+from data.main import connect, data_dir
+from bias import label_to_int
+import numpy as np
+import pandas as pd
+
+class NewsDataset(Dataset):
+ def __init__(self):
+ self.embeddings = np.load(data_dir() / 'embeddings.npy')
+ embedding_ids = pd.DataFrame(np.load(data_dir() / 'embedding_ids.npy'), columns=['id']).reset_index()
+
+ DB = connect()
+ query = """
+ SELECT
+ s.id
+ ,b.label
+ ,count(1) over (partition by publisher_id) as stories
+ FROM stories s
+ JOIN publisher_bias b
+ ON b.id = s.publisher_id
+ WHERE b.label != 'allsides'
+ """
+ data = DB.sql(query).df()
+ DB.close()
+
+ data['label'] = data['label'].apply(lambda x: label_to_int(x))
+ data = data.merge(embedding_ids)
+ self.data = data
+
+ def __len__(self):
+ return len(self.data)
+
+ def __getitem__(self, idx):
+ row = self.data.iloc[idx]
+ y = row['label']
+ # x = np.concatenate((self.embeddings[row['index']], [row['stories']])).astype(np.float32)
+ x = self.embeddings[row['index']]
+ return x, y
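+
+
+if __name__ == "__main__":
+    # hedged smoke test: assumes embeddings.npy / embedding_ids.npy and the
+    # publisher_bias table already exist
+    ds = NewsDataset()
+    x, y = ds[0]
+    print(len(ds), x.shape, y)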
diff --git a/src/train/main.py b/src/train/main.py
new file mode 100644
index 0000000..dbad152
--- /dev/null
+++ b/src/train/main.py
@@ -0,0 +1,132 @@
+import click
+from tqdm import tqdm
+from enum import Enum, auto
+from dotenv import load_dotenv
+import os
+
+import torch
+from torch import nn
+from torch import optim
+from torch.utils.data import DataLoader
+from accelerate import Accelerator
+
+from train.dataset import NewsDataset
+from train.model import Classifier
+#from model.linear import LinearClassifier
+
+class Stage(Enum):
+ TRAIN = auto()
+ DEV = auto()
+
+@click.command('train:main')
+def main():
+ dev_after = 20
+ visible_devices = None
+ lr = 1e-4
+ epochs = 10
+ debug = False
+ torch.manual_seed(0)
+ num_workers = 0
+
+ embedding_length = int(os.getenv('EMBEDDING_LENGTH', 384))
+
+ dataset = NewsDataset()
+ trainset, devset = torch.utils.data.random_split(dataset, [0.8, 0.2])
+ batch_size = 512
+ trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=num_workers, drop_last=True)
+ devloader = DataLoader(devset, shuffle=False, num_workers=num_workers)
+ accelerator = Accelerator()
+ model = Classifier(embedding_length=embedding_length, classes=5)
+
+ # it's possible to control which GPUs the process can see using an environmental variable
+ if visible_devices:
+ os.environ['CUDA_VISIBLE_DEVICES'] = visible_devices
+ if debug:
+ os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
+ #accelerator.log({"message" :"debug enabled"})
+
+ criterion = torch.nn.CrossEntropyLoss()
+ optimizer = torch.optim.Adam(model.parameters(), lr=lr)
+
+ # wrap objects with accelerate
+ model, optimizer, trainloader, devloader = accelerator.prepare(model, optimizer, trainloader, devloader)
+ def run():
+ """runner for training and valdating"""
+ running_loss = 0.0
+ # set the model to train model
+ model.train() if stage == Stage.TRAIN else model.eval()
+ dataloader = trainloader if stage == Stage.TRAIN else devloader
+ desc = 'train epoch' if stage == Stage.TRAIN else 'dev epoch'
+ if debug:
+ ...
+
+ # Make sure there are no leftover gradients before starting training an epoch
+ optimizer.zero_grad()
+
+ for batch, (x, y) in enumerate(tqdm(dataloader, desc=desc)):
+ pred_y = model(x) # Forward pass through model
+ loss = criterion(pred_y, y)
+            running_loss += loss.item()  # accumulate the scalar loss, not the graph
+ # Only update model weights on training
+ if stage == Stage.TRAIN:
+ accelerator.backward(loss) # Increment gradients within model by sending loss backwards
+ optimizer.step() # Update model weights
+ optimizer.zero_grad() # Reset gradients to 0
+ return running_loss / len(dataloader)
+
+
+ for epoch in range(epochs):
+ if (epoch - 1) % dev_after == 0:
+ stage = Stage.DEV
+ log = run()
+ print(f"dev loss: {log}")
+ else:
+ stage = Stage.TRAIN
+ log = run()
+ print(f"train loss: {log}")
+
+ from data.main import data_dir, connect
+ import numpy as np
+ import pandas as pd
+ from bias import int_to_label
+
+ embeddings = dataset.embeddings
+ embedding_ids = dataset.data
+
+ DB = connect()
+ query = """
+ SELECT
+ s.id
+ ,title
+ ,p.name
+ ,count(1) over (partition by publisher_id) as stories
+ FROM stories s
+ JOIN publishers p
+ on p.id = s.publisher_id
+ WHERE s.publisher_id NOT IN (
+ SELECT
+            publisher_id
+ FROM publisher_bias b
+ )
+ """
+ data = DB.sql(query).df()
+ embeddings = np.load(data_dir() / 'embeddings.npy')
+ embedding_ids = pd.DataFrame(np.load(data_dir() / 'embedding_ids.npy'), columns=['id']).reset_index()
+
+
+ for i in range(10):
+ embedding = embeddings[embedding_ids[embedding_ids['id'] == data.iloc[i]['id']]['index']]
+ title = data.iloc[i]['title']
+ publisher = data.iloc[i]['name']
+        class_pred = nn.functional.softmax(model(torch.tensor(embedding)), dim=-1).detach()
+        class_id = int(torch.argmax(class_pred))
+ print(f"{publisher}: {int_to_label(class_id)} - \"{title}\"")
diff --git a/src/train/model.py b/src/train/model.py
new file mode 100644
index 0000000..b9faed4
--- /dev/null
+++ b/src/train/model.py
@@ -0,0 +1,28 @@
+from torch import nn
+
+class Classifier(nn.Module):
+ def __init__(self, embedding_length: int, classes: int):
+ super().__init__()
+ out_len = 16
+ self.stack = nn.Sequential(
+ nn.Linear(embedding_length, 256),
+ nn.ReLU(),
+ nn.Linear(256, 256),
+ nn.ReLU(),
+ nn.Linear(256, 64),
+ nn.ReLU(),
+ nn.Linear(64, 64),
+ nn.ReLU(),
+ nn.Linear(64, out_len),
+ nn.ReLU(),
+ )
+ self.logits = nn.Linear(out_len, classes)
+
+ def forward(self, x):
+ x = self.stack(x)
+ self.last_hidden_layer = x.detach()
+ return self.logits(x)
+
+ def get_last_layer(self, x):
+ x = self.stack(x)
+ return x
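+
+
+if __name__ == "__main__":
+    # hedged shape check, not part of the training pipeline: 384 matches the
+    # all-MiniLM-L6-v2 sentence embedding length used elsewhere in this repo
+    import torch
+    model = Classifier(embedding_length=384, classes=5)
+    logits = model(torch.randn(2, 384))
+    print(logits.shape)  # torch.Size([2, 5])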
diff --git a/src/word.py b/src/word.py
index dc408d5..36f5244 100644
--- a/src/word.py
+++ b/src/word.py
@@ -1,7 +1,7 @@
import click
from transformers import AutoTokenizer, RobertaModel
import numpy as np
-from data import Data, from_db, connect, data_dir
+from data.main import Data, from_db, connect, data_dir
from tqdm import tqdm
import torch
from pathlib import Path